blob: ac77114da97bc0a73e28f189bee43ace395d724a [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian only when the build configuration
   defines WORDS_BIGENDIAN; little endian is the fallback. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10FFFF
72
/* _PyUnicode_CHECK(op): sanity check used inside assert() by the macros in
   this file.  Release builds only test the object type; debug builds call
   _PyUnicode_CheckConsistency() with check_content == 0. */
#if !defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors for the string object layouts.  The macros that expand
   to a bare struct member can also be used as assignment targets; the
   ones that start with assert() are read-only expressions. */

/* utf8 member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory directly after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character length for a compact
   ASCII string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length (no checks). */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header (no checks). */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)  (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)   (((PyASCIIObject *)(op))->hash)

/* state.kind and length, preceded by a _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): after asserting
   _PyUnicode_CHECK(op), evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the UTF-8 pointer of op is the same memory as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same memory as its character
   data.  The expansion refers only to the macro parameter, so it works
   regardless of what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* True if the Unicode object owns a separate UTF-8 memory block: it is not
   a compact ASCII string, its utf8 pointer is non-NULL, and that pointer
   differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separate wstr memory block: its wstr
   pointer is non-NULL and either the string is not ready yet or the
   pointer differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names; begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to and is
   cast to "to_type *".  The argument is parenthesized before the cast so
   that an expression such as "buf + offset" is converted as a whole
   instead of having the cast bind to "buf" only.
   The first loop converts four characters per iteration; the second loop
   handles the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 when the 7-bit character c is whitespace.
   Each row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Lookup table for fast detection of line breaks: entry c is 1 when the
   7-bit character c is a line break.  Each row below covers 16 code
   points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build checker for the internal invariants of a Unicode object.

   op must be a Unicode object.  The header fields (kind, ascii, compact,
   ready, wstr/utf8 pointers and lengths) are always cross-checked.  When
   check_content is non-zero and the string is not in the legacy wchar_t
   state, the characters are scanned as well to verify that the narrowest
   possible kind is used and that the buffer is null-terminated.

   Every violated invariant trips an assert(); otherwise 1 is returned, so
   the call can itself be placed inside an assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* kind whose character width equals sizeof(wchar_t): wstr is
               expected to alias the data for exactly that kind */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero cached length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }

        /* the character after the last one must be the terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
409
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100410static PyObject*
411unicode_result_wchar(PyObject *unicode)
412{
413#ifndef Py_DEBUG
414 Py_ssize_t len;
415
416 assert(Py_REFCNT(unicode) == 1);
417
418 len = _PyUnicode_WSTR_LENGTH(unicode);
419 if (len == 0) {
420 Py_INCREF(unicode_empty);
421 Py_DECREF(unicode);
422 return unicode_empty;
423 }
424
425 if (len == 1) {
426 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
427 if (ch < 256) {
428 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
429 Py_DECREF(unicode);
430 return latin1_char;
431 }
432 }
433
434 if (_PyUnicode_Ready(unicode) < 0) {
435 Py_XDECREF(unicode);
436 return NULL;
437 }
438#else
439 /* don't make the result ready in debug mode to ensure that the caller
440 makes the string ready before using it */
441 assert(_PyUnicode_CheckConsistency(unicode, 1));
442#endif
443 return unicode;
444}
445
446static PyObject*
447unicode_result_ready(PyObject *unicode)
448{
449 Py_ssize_t length;
450
451 length = PyUnicode_GET_LENGTH(unicode);
452 if (length == 0) {
453 if (unicode != unicode_empty) {
454 Py_INCREF(unicode_empty);
455 Py_DECREF(unicode);
456 }
457 return unicode_empty;
458 }
459
460 if (length == 1) {
461 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
462 if (ch < 256) {
463 PyObject *latin1_char = unicode_latin1[ch];
464 if (latin1_char != NULL) {
465 if (unicode != latin1_char) {
466 Py_INCREF(latin1_char);
467 Py_DECREF(unicode);
468 }
469 return latin1_char;
470 }
471 else {
472 assert(_PyUnicode_CheckConsistency(unicode, 1));
473 Py_INCREF(unicode);
474 unicode_latin1[ch] = unicode;
475 return unicode;
476 }
477 }
478 }
479
480 assert(_PyUnicode_CheckConsistency(unicode, 1));
481 return unicode;
482}
483
484static PyObject*
485unicode_result(PyObject *unicode)
486{
487 assert(_PyUnicode_CHECK(unicode));
488 if (PyUnicode_IS_READY(unicode))
489 return unicode_result_ready(unicode);
490 else
491 return unicode_result_wchar(unicode);
492}
493
Victor Stinnerc4b49542011-12-11 22:44:26 +0100494static PyObject*
495unicode_result_unchanged(PyObject *unicode)
496{
497 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500498 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499 return NULL;
500 Py_INCREF(unicode);
501 return unicode;
502 }
503 else
504 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100505 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100506}
507
#if defined(HAVE_MBCS)
/* Windows version information; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
511
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* The linebreak mask is set up by the Unicode initialization function
   (_PyUnicode_Init) below. */
519
Antoine Pitrouf068f942010-01-13 14:19:12 +0000520#if LONG_BIT >= 128
521#define BLOOM_WIDTH 128
522#elif LONG_BIT >= 64
523#define BLOOM_WIDTH 64
524#elif LONG_BIT >= 32
525#define BLOOM_WIDTH 32
526#else
527#error "LONG_BIT is smaller than 32"
528#endif
529
/* Storage type of a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in the
   lvalue mask.
   BLOOM(mask, ch): non-zero if that bit is set in mask, i.e. ch *may* be
   a member; zero means ch is definitely not a member.
   mask is parenthesized so that an expression argument (for example
   "a | b") is combined as a whole. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 use the ascii_linebreak table;
   all others are pre-filtered through bloom_linebreak before the full
   Py_UNICODE_ISLINEBREAK() check.  Note that ch is evaluated more than
   once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Alexander Belopolsky40018472011-02-26 01:02:56 +0000541Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543{
544 /* calculate simple bloom-style bitmask for a given unicode string */
545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547 Py_ssize_t i;
548
549 mask = 0;
550 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552
553 return mask;
554}
555
/* Membership test: true if chr occurs in str.  The bloom mask is checked
   first, so PyUnicode_FindChar() only runs when the mask says chr may be
   present. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000559
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200560/* Compilation of templated routines */
561
562#include "stringlib/asciilib.h"
563#include "stringlib/fastsearch.h"
564#include "stringlib/partition.h"
565#include "stringlib/split.h"
566#include "stringlib/count.h"
567#include "stringlib/find.h"
568#include "stringlib/find_max_char.h"
569#include "stringlib/localeutil.h"
570#include "stringlib/undef.h"
571
572#include "stringlib/ucs1lib.h"
573#include "stringlib/fastsearch.h"
574#include "stringlib/partition.h"
575#include "stringlib/split.h"
576#include "stringlib/count.h"
577#include "stringlib/find.h"
578#include "stringlib/find_max_char.h"
579#include "stringlib/localeutil.h"
580#include "stringlib/undef.h"
581
582#include "stringlib/ucs2lib.h"
583#include "stringlib/fastsearch.h"
584#include "stringlib/partition.h"
585#include "stringlib/split.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/find_max_char.h"
589#include "stringlib/localeutil.h"
590#include "stringlib/undef.h"
591
592#include "stringlib/ucs4lib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602#include "stringlib/unicodedefs.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100606#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608/* --- Unicode Object ----------------------------------------------------- */
609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200611fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
614 Py_ssize_t size, Py_UCS4 ch,
615 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200617 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
618
619 switch (kind) {
620 case PyUnicode_1BYTE_KIND:
621 {
622 Py_UCS1 ch1 = (Py_UCS1) ch;
623 if (ch1 == ch)
624 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_2BYTE_KIND:
629 {
630 Py_UCS2 ch2 = (Py_UCS2) ch;
631 if (ch2 == ch)
632 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
633 else
634 return -1;
635 }
636 case PyUnicode_4BYTE_KIND:
637 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
638 default:
639 assert(0);
640 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642}
643
Victor Stinnerfe226c02011-10-03 03:52:20 +0200644static PyObject*
645resize_compact(PyObject *unicode, Py_ssize_t length)
646{
647 Py_ssize_t char_size;
648 Py_ssize_t struct_size;
649 Py_ssize_t new_size;
650 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100651 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
688 return unicode;
689}
690
Alexander Belopolsky40018472011-02-26 01:02:56 +0000691static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200692resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693{
Victor Stinner95663112011-10-04 01:03:50 +0200694 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100695 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200696 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 if (PyUnicode_IS_READY(unicode)) {
700 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200701 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 void *data;
703
704 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200705 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200706 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
707 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708
709 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
710 PyErr_NoMemory();
711 return -1;
712 }
713 new_size = (length + 1) * char_size;
714
Victor Stinner7a9105a2011-12-12 00:13:42 +0100715 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
716 {
717 PyObject_DEL(_PyUnicode_UTF8(unicode));
718 _PyUnicode_UTF8(unicode) = NULL;
719 _PyUnicode_UTF8_LENGTH(unicode) = 0;
720 }
721
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 data = (PyObject *)PyObject_REALLOC(data, new_size);
723 if (data == NULL) {
724 PyErr_NoMemory();
725 return -1;
726 }
727 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_WSTR_LENGTH(unicode) = length;
731 }
732 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200733 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200734 _PyUnicode_UTF8_LENGTH(unicode) = length;
735 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_LENGTH(unicode) = length;
737 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200738 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200739 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinner95663112011-10-04 01:03:50 +0200743 assert(_PyUnicode_WSTR(unicode) != NULL);
744
745 /* check for integer overflow */
746 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
747 PyErr_NoMemory();
748 return -1;
749 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100750 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200751 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100752 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200753 if (!wstr) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_WSTR(unicode) = wstr;
758 _PyUnicode_WSTR(unicode)[length] = 0;
759 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200760 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 return 0;
762}
763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764static PyObject*
765resize_copy(PyObject *unicode, Py_ssize_t length)
766{
767 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100770
Benjamin Petersonbac79492012-01-14 13:34:47 -0500771 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100772 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
775 if (copy == NULL)
776 return NULL;
777
778 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200779 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200781 }
782 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200783 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100784
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200785 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200786 if (w == NULL)
787 return NULL;
788 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
789 copy_length = Py_MIN(copy_length, length);
790 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
791 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 }
794}
795
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000797 Ux0000 terminated; some code (e.g. new_identifier)
798 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799
800 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000801 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
803*/
804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200806static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807#endif
808
Alexander Belopolsky40018472011-02-26 01:02:56 +0000809static PyUnicodeObject *
810_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811{
812 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814
Thomas Wouters477c8d52006-05-27 19:21:47 +0000815 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 if (length == 0 && unicode_empty != NULL) {
817 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200818 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 }
820
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000821 /* Ensure we won't overflow the size. */
822 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
823 return (PyUnicodeObject *)PyErr_NoMemory();
824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825 if (length < 0) {
826 PyErr_SetString(PyExc_SystemError,
827 "Negative size passed to _PyUnicode_New");
828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 }
830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831#ifdef Py_DEBUG
832 ++unicode_old_new_calls;
833#endif
834
835 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
836 if (unicode == NULL)
837 return NULL;
838 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
839 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
840 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100841 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000842 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100843 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Jeremy Hyltond8082792003-09-16 19:41:39 +0000846 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000847 * the caller fails before initializing str -- unicode_resize()
848 * reads str[0], and the Keep-Alive optimization can keep memory
849 * allocated for str alive across a call to unicode_dealloc(unicode).
850 * We don't want unicode_resize to read uninitialized memory in
851 * that case.
852 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853 _PyUnicode_WSTR(unicode)[0] = 0;
854 _PyUnicode_WSTR(unicode)[length] = 0;
855 _PyUnicode_WSTR_LENGTH(unicode) = length;
856 _PyUnicode_HASH(unicode) = -1;
857 _PyUnicode_STATE(unicode).interned = 0;
858 _PyUnicode_STATE(unicode).kind = 0;
859 _PyUnicode_STATE(unicode).compact = 0;
860 _PyUnicode_STATE(unicode).ready = 0;
861 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200862 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200864 _PyUnicode_UTF8(unicode) = NULL;
865 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100866 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867 return unicode;
868}
869
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870static const char*
871unicode_kind_name(PyObject *unicode)
872{
Victor Stinner42dfd712011-10-03 14:41:45 +0200873 /* don't check consistency: unicode_kind_name() is called from
874 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875 if (!PyUnicode_IS_COMPACT(unicode))
876 {
877 if (!PyUnicode_IS_READY(unicode))
878 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600879 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 {
881 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200882 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200883 return "legacy ascii";
884 else
885 return "legacy latin1";
886 case PyUnicode_2BYTE_KIND:
887 return "legacy UCS2";
888 case PyUnicode_4BYTE_KIND:
889 return "legacy UCS4";
890 default:
891 return "<legacy invalid kind>";
892 }
893 }
894 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 return "ascii";
899 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 default:
906 return "<invalid compact kind>";
907 }
908}
909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910#ifdef Py_DEBUG
/* Debug-build call counter incremented by PyUnicode_New(); static storage
   starts at zero. */
static int unicode_new_new_calls;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
913/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
917
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
921void *_PyUnicode_data(void *unicode){
922 printf("obj %p\n", unicode);
923 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
924 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
925 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
926 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
927 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
928 return PyUnicode_DATA(unicode);
929}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200930
931void
932_PyUnicode_Dump(PyObject *op)
933{
934 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
936 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
937 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200938
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200940 {
941 if (ascii->state.ascii)
942 data = (ascii + 1);
943 else
944 data = (compact + 1);
945 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 else
947 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200948 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
949
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 if (ascii->wstr == data)
951 printf("shared ");
952 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera3b334d2011-10-03 13:53:37 +0200954 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 printf(" (%zu), ", compact->wstr_length);
956 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
957 printf("shared ");
958 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200961}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962#endif
963
964PyObject *
965PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
966{
967 PyObject *obj;
968 PyCompactUnicodeObject *unicode;
969 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200970 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200971 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 Py_ssize_t char_size;
973 Py_ssize_t struct_size;
974
975 /* Optimization for empty strings */
976 if (size == 0 && unicode_empty != NULL) {
977 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200978 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
980
981#ifdef Py_DEBUG
982 ++unicode_new_new_calls;
983#endif
984
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 is_ascii = 0;
986 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 struct_size = sizeof(PyCompactUnicodeObject);
988 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200989 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 char_size = 1;
991 is_ascii = 1;
992 struct_size = sizeof(PyASCIIObject);
993 }
994 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200995 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996 char_size = 1;
997 }
998 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +0200999 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 char_size = 2;
1001 if (sizeof(wchar_t) == 2)
1002 is_sharing = 1;
1003 }
1004 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001005 if (maxchar > MAX_UNICODE) {
1006 PyErr_SetString(PyExc_SystemError,
1007 "invalid maximum character passed to PyUnicode_New");
1008 return NULL;
1009 }
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 4;
1012 if (sizeof(wchar_t) == 4)
1013 is_sharing = 1;
1014 }
1015
1016 /* Ensure we won't overflow the size. */
1017 if (size < 0) {
1018 PyErr_SetString(PyExc_SystemError,
1019 "Negative size passed to PyUnicode_New");
1020 return NULL;
1021 }
1022 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1023 return PyErr_NoMemory();
1024
1025 /* Duplicated allocation code from _PyObject_New() instead of a call to
1026 * PyObject_New() so we are able to allocate space for the object and
1027 * it's data buffer.
1028 */
1029 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1030 if (obj == NULL)
1031 return PyErr_NoMemory();
1032 obj = PyObject_INIT(obj, &PyUnicode_Type);
1033 if (obj == NULL)
1034 return NULL;
1035
1036 unicode = (PyCompactUnicodeObject *)obj;
1037 if (is_ascii)
1038 data = ((PyASCIIObject*)obj) + 1;
1039 else
1040 data = unicode + 1;
1041 _PyUnicode_LENGTH(unicode) = size;
1042 _PyUnicode_HASH(unicode) = -1;
1043 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001044 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 _PyUnicode_STATE(unicode).compact = 1;
1046 _PyUnicode_STATE(unicode).ready = 1;
1047 _PyUnicode_STATE(unicode).ascii = is_ascii;
1048 if (is_ascii) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 }
Victor Stinner8f825062012-04-27 13:55:39 +02001052 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 ((char*)data)[size] = 0;
1054 _PyUnicode_WSTR(unicode) = NULL;
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 else {
1060 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001061 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001062 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001064 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 ((Py_UCS4*)data)[size] = 0;
1066 if (is_sharing) {
1067 _PyUnicode_WSTR_LENGTH(unicode) = size;
1068 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1069 }
1070 else {
1071 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1072 _PyUnicode_WSTR(unicode) = NULL;
1073 }
1074 }
Victor Stinner8f825062012-04-27 13:55:39 +02001075#ifdef Py_DEBUG
1076 /* Fill the data with invalid characters to detect bugs earlier.
1077 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1078 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1079 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1080 memset(data, 0xff, size * kind);
1081#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001082 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083 return obj;
1084}
1085
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is combined into
   one code point; every other unit, lone surrogates included, is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   `unicode` must already be a 4-byte-kind string; the number of code
   points produced has to equal its length (asserted).  This function
   assumes that unicode can hold one more code point than wstr characters
   for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* well-formed pair: two input units, one output code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1125
Victor Stinnercd9950f2011-10-02 00:34:53 +02001126static int
Victor Stinner488fa492011-12-12 00:01:39 +01001127unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001128{
Victor Stinner488fa492011-12-12 00:01:39 +01001129 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001130 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001131 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001132 return -1;
1133 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001134 return 0;
1135}
1136
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137static int
1138_copy_characters(PyObject *to, Py_ssize_t to_start,
1139 PyObject *from, Py_ssize_t from_start,
1140 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 unsigned int from_kind, to_kind;
1143 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001146 assert(PyUnicode_Check(from));
1147 assert(PyUnicode_Check(to));
1148 assert(PyUnicode_IS_READY(from));
1149 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1152 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1153 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001155 if (how_many == 0)
1156 return 0;
1157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001161 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163#ifdef Py_DEBUG
1164 if (!check_maxchar
1165 && (from_kind > to_kind
1166 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001168 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1169 Py_UCS4 ch;
1170 Py_ssize_t i;
1171 for (i=0; i < how_many; i++) {
1172 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1173 assert(ch <= to_maxchar);
1174 }
1175 }
1176#endif
1177 fast = (from_kind == to_kind);
1178 if (check_maxchar
1179 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1180 {
1181 /* deny latin1 => ascii */
1182 fast = 0;
1183 }
1184
1185 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001186 Py_MEMCPY((char*)to_data + to_kind * to_start,
1187 (char*)from_data + from_kind * from_start,
1188 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001190 else if (from_kind == PyUnicode_1BYTE_KIND
1191 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 {
1193 _PyUnicode_CONVERT_BYTES(
1194 Py_UCS1, Py_UCS2,
1195 PyUnicode_1BYTE_DATA(from) + from_start,
1196 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1197 PyUnicode_2BYTE_DATA(to) + to_start
1198 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001199 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001200 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001201 && to_kind == PyUnicode_4BYTE_KIND)
1202 {
1203 _PyUnicode_CONVERT_BYTES(
1204 Py_UCS1, Py_UCS4,
1205 PyUnicode_1BYTE_DATA(from) + from_start,
1206 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1207 PyUnicode_4BYTE_DATA(to) + to_start
1208 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 }
1210 else if (from_kind == PyUnicode_2BYTE_KIND
1211 && to_kind == PyUnicode_4BYTE_KIND)
1212 {
1213 _PyUnicode_CONVERT_BYTES(
1214 Py_UCS2, Py_UCS4,
1215 PyUnicode_2BYTE_DATA(from) + from_start,
1216 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1217 PyUnicode_4BYTE_DATA(to) + to_start
1218 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001219 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001221 /* check if max_char(from substring) <= max_char(to) */
1222 if (from_kind > to_kind
1223 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001224 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001225 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001226 /* slow path to check for character overflow */
1227 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001228 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 Py_ssize_t i;
1230
Victor Stinner56c161a2011-10-06 02:47:11 +02001231#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 for (i=0; i < how_many; i++) {
1233 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001235 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1236 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001237#else
1238 if (!check_maxchar) {
1239 for (i=0; i < how_many; i++) {
1240 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1241 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1242 }
1243 }
1244 else {
1245 for (i=0; i < how_many; i++) {
1246 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1247 if (ch > to_maxchar)
1248 return 1;
1249 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1250 }
1251 }
1252#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001253 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001254 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001255 assert(0 && "inconsistent state");
1256 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001257 }
1258 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001259 return 0;
1260}
1261
1262static void
1263copy_characters(PyObject *to, Py_ssize_t to_start,
1264 PyObject *from, Py_ssize_t from_start,
1265 Py_ssize_t how_many)
1266{
1267 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1268}
1269
1270Py_ssize_t
1271PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1272 PyObject *from, Py_ssize_t from_start,
1273 Py_ssize_t how_many)
1274{
1275 int err;
1276
1277 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1278 PyErr_BadInternalCall();
1279 return -1;
1280 }
1281
Benjamin Petersonbac79492012-01-14 13:34:47 -05001282 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001284 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001285 return -1;
1286
1287 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1288 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1289 PyErr_Format(PyExc_SystemError,
1290 "Cannot write %zi characters at %zi "
1291 "in a string of %zi characters",
1292 how_many, to_start, PyUnicode_GET_LENGTH(to));
1293 return -1;
1294 }
1295
1296 if (how_many == 0)
1297 return 0;
1298
Victor Stinner488fa492011-12-12 00:01:39 +01001299 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001300 return -1;
1301
1302 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1303 if (err) {
1304 PyErr_Format(PyExc_SystemError,
1305 "Cannot copy %s characters "
1306 "into a string of %s characters",
1307 unicode_kind_name(from),
1308 unicode_kind_name(to));
1309 return -1;
1310 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001311 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312}
1313
Victor Stinner17222162011-09-28 22:15:37 +02001314/* Find the maximum code point and count the number of surrogate pairs so a
1315 correct string length can be computed before converting a string to UCS4.
1316 This function counts single surrogates as a character and not as a pair.
1317
1318 Return 0 on success, or -1 on error. */
1319static int
1320find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1321 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322{
1323 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001324 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325
Victor Stinnerc53be962011-10-02 21:33:54 +02001326 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 *num_surrogates = 0;
1328 *maxchar = 0;
1329
1330 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001332 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1333 && (iter+1) < end
1334 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001336 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 iter += 2;
1339 }
1340 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001342 {
1343 ch = *iter;
1344 iter++;
1345 }
1346 if (ch > *maxchar) {
1347 *maxchar = ch;
1348 if (*maxchar > MAX_UNICODE) {
1349 PyErr_Format(PyExc_ValueError,
1350 "character U+%x is not in range [U+0000; U+10ffff]",
1351 ch);
1352 return -1;
1353 }
1354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 }
1356 return 0;
1357}
1358
1359#ifdef Py_DEBUG
/* Debug-build call counter incremented by _PyUnicode_Ready(); static
   storage starts at zero. */
static int unicode_ready_calls;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361#endif
1362
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001363int
1364_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365{
1366 wchar_t *end;
1367 Py_UCS4 maxchar = 0;
1368 Py_ssize_t num_surrogates;
1369#if SIZEOF_WCHAR_T == 2
1370 Py_ssize_t length_wo_surrogates;
1371#endif
1372
Georg Brandl7597add2011-10-05 16:36:47 +02001373 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 strings were created using _PyObject_New() and where no canonical
1375 representation (the str field) has been set yet aka strings
1376 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001377 assert(_PyUnicode_CHECK(unicode));
1378 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001381 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 /* Actually, it should neither be interned nor be anything else: */
1383 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
1385#ifdef Py_DEBUG
1386 ++unicode_ready_calls;
1387#endif
1388
1389 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001390 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001391 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393
1394 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001395 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1396 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 PyErr_NoMemory();
1398 return -1;
1399 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001400 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 _PyUnicode_WSTR(unicode), end,
1402 PyUnicode_1BYTE_DATA(unicode));
1403 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1404 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1405 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1406 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001407 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 }
1411 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001412 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001413 _PyUnicode_UTF8(unicode) = NULL;
1414 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
1416 PyObject_FREE(_PyUnicode_WSTR(unicode));
1417 _PyUnicode_WSTR(unicode) = NULL;
1418 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1419 }
1420 /* In this case we might have to convert down from 4-byte native
1421 wchar_t to 2-byte unicode. */
1422 else if (maxchar < 65536) {
1423 assert(num_surrogates == 0 &&
1424 "FindMaxCharAndNumSurrogatePairs() messed up");
1425
Victor Stinner506f5922011-09-28 22:34:18 +02001426#if SIZEOF_WCHAR_T == 2
1427 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001428 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434#else
1435 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001436 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001437 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001438 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyErr_NoMemory();
1440 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinner506f5922011-09-28 22:34:18 +02001442 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1443 _PyUnicode_WSTR(unicode), end,
1444 PyUnicode_2BYTE_DATA(unicode));
1445 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1446 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1447 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001448 _PyUnicode_UTF8(unicode) = NULL;
1449 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001450 PyObject_FREE(_PyUnicode_WSTR(unicode));
1451 _PyUnicode_WSTR(unicode) = NULL;
1452 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1453#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 }
1455 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1456 else {
1457#if SIZEOF_WCHAR_T == 2
1458 /* in case the native representation is 2-bytes, we need to allocate a
1459 new normalized 4-byte version. */
1460 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1462 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 PyErr_NoMemory();
1464 return -1;
1465 }
1466 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001468 _PyUnicode_UTF8(unicode) = NULL;
1469 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001470 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001472 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 PyObject_FREE(_PyUnicode_WSTR(unicode));
1474 _PyUnicode_WSTR(unicode) = NULL;
1475 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1476#else
1477 assert(num_surrogates == 0);
1478
Victor Stinnerc3c74152011-10-02 20:39:55 +02001479 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001481 _PyUnicode_UTF8(unicode) = NULL;
1482 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1484#endif
1485 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1486 }
1487 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001488 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 return 0;
1490}
1491
Alexander Belopolsky40018472011-02-26 01:02:56 +00001492static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001493unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494{
Walter Dörwald16807132007-05-25 13:52:07 +00001495 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_NOT_INTERNED:
1497 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 case SSTATE_INTERNED_MORTAL:
1500 /* revive dead object temporarily for DelItem */
1501 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001502 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 Py_FatalError(
1504 "deletion of interned string failed");
1505 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001506
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 case SSTATE_INTERNED_IMMORTAL:
1508 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001509
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 default:
1511 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001512 }
1513
Victor Stinner03490912011-10-03 23:45:12 +02001514 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001516 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001517 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001518 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1519 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001521 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522}
1523
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or the
   cached single-character string stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only strings that are not of the wchar kind and hold exactly one
       character can live in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode;
}
#endif
1540
Alexander Belopolsky40018472011-02-26 01:02:56 +00001541static int
Victor Stinner488fa492011-12-12 00:01:39 +01001542unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543{
Victor Stinner488fa492011-12-12 00:01:39 +01001544 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 if (Py_REFCNT(unicode) != 1)
1546 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001547 if (_PyUnicode_HASH(unicode) != -1)
1548 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (PyUnicode_CHECK_INTERNED(unicode))
1550 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001551 if (!PyUnicode_CheckExact(unicode))
1552 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001553#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 /* singleton refcount is greater than 1 */
1555 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001556#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 return 1;
1558}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001559
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560static int
1561unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1562{
1563 PyObject *unicode;
1564 Py_ssize_t old_length;
1565
1566 assert(p_unicode != NULL);
1567 unicode = *p_unicode;
1568
1569 assert(unicode != NULL);
1570 assert(PyUnicode_Check(unicode));
1571 assert(0 <= length);
1572
Victor Stinner910337b2011-10-03 03:20:16 +02001573 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 old_length = PyUnicode_WSTR_LENGTH(unicode);
1575 else
1576 old_length = PyUnicode_GET_LENGTH(unicode);
1577 if (old_length == length)
1578 return 0;
1579
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001580 if (length == 0) {
1581 Py_DECREF(*p_unicode);
1582 *p_unicode = unicode_empty;
1583 Py_INCREF(*p_unicode);
1584 return 0;
1585 }
1586
Victor Stinner488fa492011-12-12 00:01:39 +01001587 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *copy = resize_copy(unicode, length);
1589 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001590 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 Py_DECREF(*p_unicode);
1592 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594 }
1595
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 PyObject *new_unicode = resize_compact(unicode, length);
1598 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001601 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001603 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001604 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605}
1606
Alexander Belopolsky40018472011-02-26 01:02:56 +00001607int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001608PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001609{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001610 PyObject *unicode;
1611 if (p_unicode == NULL) {
1612 PyErr_BadInternalCall();
1613 return -1;
1614 }
1615 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001616 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 {
1618 PyErr_BadInternalCall();
1619 return -1;
1620 }
1621 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001622}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001623
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001624static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001625unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001626{
1627 PyObject *result;
1628 assert(PyUnicode_IS_READY(*p_unicode));
1629 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1630 return 0;
1631 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1632 maxchar);
1633 if (result == NULL)
1634 return -1;
1635 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1636 PyUnicode_GET_LENGTH(*p_unicode));
1637 Py_DECREF(*p_unicode);
1638 *p_unicode = result;
1639 return 0;
1640}
1641
1642static int
1643unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1644 Py_UCS4 ch)
1645{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001646 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode_widen(p_unicode, ch) < 0)
1648 return -1;
1649 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1650 PyUnicode_DATA(*p_unicode),
1651 (*pos)++, ch);
1652 return 0;
1653}
1654
Victor Stinnerc5166102012-02-22 13:55:02 +01001655/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1656 Return the length of the input string.
1657
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001658 WARNING: The function doesn't copy the terminating null character and
1659 doesn't check the maximum character (may write a latin1 character in an
1660 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001661static Py_ssize_t
1662unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1663{
1664 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1665 void *data = PyUnicode_DATA(unicode);
1666
1667 switch (kind) {
1668 case PyUnicode_1BYTE_KIND: {
1669 Py_ssize_t len = strlen(str);
1670 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001672 return len;
1673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
1679 for (; *str; ++ucs2, ++str)
1680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1683 return ucs2 - start;
1684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
1691 for (; *str; ++ucs4, ++str)
1692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1695 return ucs4 - start;
1696 }
1697 }
1698}
1699
1700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701static PyObject*
1702get_latin1_char(unsigned char ch)
1703{
Victor Stinnera464fc12011-10-02 20:39:30 +02001704 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001706 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 if (!unicode)
1708 return NULL;
1709 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001710 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 unicode_latin1[ch] = unicode;
1712 }
1713 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001714 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001720 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 Py_UCS4 maxchar = 0;
1722 Py_ssize_t num_surrogates;
1723
1724 if (u == NULL)
1725 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001727 /* If the Unicode data is known at construction time, we can apply
1728 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 /* Optimization for empty strings */
1731 if (size == 0 && unicode_empty != NULL) {
1732 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001733 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734 }
Tim Petersced69f82003-09-16 20:30:58 +00001735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 /* Single character Unicode objects in the Latin-1 range are
1737 shared when using this constructor */
1738 if (size == 1 && *u < 256)
1739 return get_latin1_char((unsigned char)*u);
1740
1741 /* If not empty and not single character, copy the Unicode data
1742 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001743 if (find_maxchar_surrogates(u, u + size,
1744 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 return NULL;
1746
Victor Stinner8faf8212011-12-08 22:14:11 +01001747 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 if (!unicode)
1749 return NULL;
1750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 switch (PyUnicode_KIND(unicode)) {
1752 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001753 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1755 break;
1756 case PyUnicode_2BYTE_KIND:
1757#if Py_UNICODE_SIZE == 2
1758 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1759#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001760 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1762#endif
1763 break;
1764 case PyUnicode_4BYTE_KIND:
1765#if SIZEOF_WCHAR_T == 2
1766 /* This is the only case which has to process surrogates, thus
1767 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769#else
1770 assert(num_surrogates == 0);
1771 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1772#endif
1773 break;
1774 default:
1775 assert(0 && "Impossible state");
1776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001778 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779}
1780
Alexander Belopolsky40018472011-02-26 01:02:56 +00001781PyObject *
1782PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001783{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001784 if (size < 0) {
1785 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 return NULL;
1788 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001789 if (u != NULL)
1790 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1791 else
1792 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001793}
1794
Alexander Belopolsky40018472011-02-26 01:02:56 +00001795PyObject *
1796PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797{
1798 size_t size = strlen(u);
1799 if (size > PY_SSIZE_T_MAX) {
1800 PyErr_SetString(PyExc_OverflowError, "input too long");
1801 return NULL;
1802 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001803 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001804}
1805
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001806PyObject *
1807_PyUnicode_FromId(_Py_Identifier *id)
1808{
1809 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001810 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1811 strlen(id->string),
1812 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001813 if (!id->object)
1814 return NULL;
1815 PyUnicode_InternInPlace(&id->object);
1816 assert(!id->next);
1817 id->next = static_strings;
1818 static_strings = id;
1819 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001820 return id->object;
1821}
1822
1823void
1824_PyUnicode_ClearStaticStrings()
1825{
1826 _Py_Identifier *i;
1827 for (i = static_strings; i; i = i->next) {
1828 Py_DECREF(i->object);
1829 i->object = NULL;
1830 i->next = NULL;
1831 }
1832}
1833
Benjamin Peterson0df54292012-03-26 14:50:32 -04001834/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001835
Victor Stinnere57b1c02011-09-28 22:20:48 +02001836static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001837unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001838{
Victor Stinner785938e2011-12-11 20:09:03 +01001839 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001840 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001841#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001842 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001844 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 }
Victor Stinner785938e2011-12-11 20:09:03 +01001846 unicode = PyUnicode_New(size, 127);
1847 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001848 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850 assert(_PyUnicode_CheckConsistency(unicode, 1));
1851 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001852}
1853
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001857 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858 case PyUnicode_1BYTE_KIND:
1859 return 0x80;
1860 case PyUnicode_2BYTE_KIND:
1861 return 0x100;
1862 case PyUnicode_4BYTE_KIND:
1863 return 0x10000;
1864 default:
1865 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001866 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001867 }
1868}
1869
Victor Stinner702c7342011-10-05 13:50:52 +02001870static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001871_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001875
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001876 if (size == 0) {
1877 Py_INCREF(unicode_empty);
1878 return unicode_empty;
1879 }
1880 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001881 if (size == 1)
1882 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001884 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001885 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 if (!res)
1887 return NULL;
1888 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001889 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001891}
1892
Victor Stinnere57b1c02011-09-28 22:20:48 +02001893static PyObject*
1894_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895{
1896 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001897 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001898
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001899 if (size == 0) {
1900 Py_INCREF(unicode_empty);
1901 return unicode_empty;
1902 }
1903 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001904 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001905 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001906
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001907 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001908 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 if (!res)
1910 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001913 else {
1914 _PyUnicode_CONVERT_BYTES(
1915 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1916 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001917 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 return res;
1919}
1920
Victor Stinnere57b1c02011-09-28 22:20:48 +02001921static PyObject*
1922_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923{
1924 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001925 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001926
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 if (size == 0) {
1928 Py_INCREF(unicode_empty);
1929 return unicode_empty;
1930 }
1931 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001932 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933 return get_latin1_char((unsigned char)u[0]);
1934
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001936 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 if (!res)
1938 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001939 if (max_char < 256)
1940 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1941 PyUnicode_1BYTE_DATA(res));
1942 else if (max_char < 0x10000)
1943 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1944 PyUnicode_2BYTE_DATA(res));
1945 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001947 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 return res;
1949}
1950
1951PyObject*
1952PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1953{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001954 if (size < 0) {
1955 PyErr_SetString(PyExc_ValueError, "size must be positive");
1956 return NULL;
1957 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001958 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001960 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001962 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001964 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001965 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001966 PyErr_SetString(PyExc_SystemError, "invalid kind");
1967 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969}
1970
Victor Stinnerece58de2012-04-23 23:36:38 +02001971Py_UCS4
1972_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1973{
1974 enum PyUnicode_Kind kind;
1975 void *startptr, *endptr;
1976
1977 assert(PyUnicode_IS_READY(unicode));
1978 assert(0 <= start);
1979 assert(end <= PyUnicode_GET_LENGTH(unicode));
1980 assert(start <= end);
1981
1982 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
1983 return PyUnicode_MAX_CHAR_VALUE(unicode);
1984
1985 if (start == end)
1986 return 127;
1987
Victor Stinner94d558b2012-04-27 22:26:58 +02001988 if (PyUnicode_IS_ASCII(unicode))
1989 return 127;
1990
Victor Stinnerece58de2012-04-23 23:36:38 +02001991 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04001992 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04001993 endptr = (char *)startptr + end * kind;
1994 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001995 switch(kind) {
1996 case PyUnicode_1BYTE_KIND:
1997 return ucs1lib_find_max_char(startptr, endptr);
1998 case PyUnicode_2BYTE_KIND:
1999 return ucs2lib_find_max_char(startptr, endptr);
2000 case PyUnicode_4BYTE_KIND:
2001 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002002 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002003 assert(0);
2004 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002005 }
2006}
2007
Victor Stinner25a4b292011-10-06 12:31:55 +02002008/* Ensure that a string uses the most efficient storage, if it is not the
2009 case: create a new string with of the right kind. Write NULL into *p_unicode
2010 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002011static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002012unicode_adjust_maxchar(PyObject **p_unicode)
2013{
2014 PyObject *unicode, *copy;
2015 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002016 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002017 unsigned int kind;
2018
2019 assert(p_unicode != NULL);
2020 unicode = *p_unicode;
2021 assert(PyUnicode_IS_READY(unicode));
2022 if (PyUnicode_IS_ASCII(unicode))
2023 return;
2024
2025 len = PyUnicode_GET_LENGTH(unicode);
2026 kind = PyUnicode_KIND(unicode);
2027 if (kind == PyUnicode_1BYTE_KIND) {
2028 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002029 max_char = ucs1lib_find_max_char(u, u + len);
2030 if (max_char >= 128)
2031 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002032 }
2033 else if (kind == PyUnicode_2BYTE_KIND) {
2034 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002035 max_char = ucs2lib_find_max_char(u, u + len);
2036 if (max_char >= 256)
2037 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002038 }
2039 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002040 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002041 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002042 max_char = ucs4lib_find_max_char(u, u + len);
2043 if (max_char >= 0x10000)
2044 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002045 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002046 copy = PyUnicode_New(len, max_char);
2047 copy_characters(copy, 0, unicode, 0, len);
2048 Py_DECREF(unicode);
2049 *p_unicode = copy;
2050}
2051
Victor Stinner034f6cf2011-09-30 02:26:44 +02002052PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002053_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002054{
Victor Stinner87af4f22011-11-21 23:03:47 +01002055 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002056 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002057
Victor Stinner034f6cf2011-09-30 02:26:44 +02002058 if (!PyUnicode_Check(unicode)) {
2059 PyErr_BadInternalCall();
2060 return NULL;
2061 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002062 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002063 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002064
Victor Stinner87af4f22011-11-21 23:03:47 +01002065 length = PyUnicode_GET_LENGTH(unicode);
2066 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002067 if (!copy)
2068 return NULL;
2069 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2070
Victor Stinner87af4f22011-11-21 23:03:47 +01002071 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2072 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002073 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002074 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002075}
2076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077
Victor Stinnerbc603d12011-10-02 01:00:40 +02002078/* Widen Unicode objects to larger buffers. Don't write terminating null
2079 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080
2081void*
2082_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2083{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002084 Py_ssize_t len;
2085 void *result;
2086 unsigned int skind;
2087
Benjamin Petersonbac79492012-01-14 13:34:47 -05002088 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002089 return NULL;
2090
2091 len = PyUnicode_GET_LENGTH(s);
2092 skind = PyUnicode_KIND(s);
2093 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002094 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 return NULL;
2096 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002097 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002098 case PyUnicode_2BYTE_KIND:
2099 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2100 if (!result)
2101 return PyErr_NoMemory();
2102 assert(skind == PyUnicode_1BYTE_KIND);
2103 _PyUnicode_CONVERT_BYTES(
2104 Py_UCS1, Py_UCS2,
2105 PyUnicode_1BYTE_DATA(s),
2106 PyUnicode_1BYTE_DATA(s) + len,
2107 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002109 case PyUnicode_4BYTE_KIND:
2110 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2111 if (!result)
2112 return PyErr_NoMemory();
2113 if (skind == PyUnicode_2BYTE_KIND) {
2114 _PyUnicode_CONVERT_BYTES(
2115 Py_UCS2, Py_UCS4,
2116 PyUnicode_2BYTE_DATA(s),
2117 PyUnicode_2BYTE_DATA(s) + len,
2118 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002120 else {
2121 assert(skind == PyUnicode_1BYTE_KIND);
2122 _PyUnicode_CONVERT_BYTES(
2123 Py_UCS1, Py_UCS4,
2124 PyUnicode_1BYTE_DATA(s),
2125 PyUnicode_1BYTE_DATA(s) + len,
2126 result);
2127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002129 default:
2130 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 }
Victor Stinner01698042011-10-04 00:04:26 +02002132 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 return NULL;
2134}
2135
2136static Py_UCS4*
2137as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2138 int copy_null)
2139{
2140 int kind;
2141 void *data;
2142 Py_ssize_t len, targetlen;
2143 if (PyUnicode_READY(string) == -1)
2144 return NULL;
2145 kind = PyUnicode_KIND(string);
2146 data = PyUnicode_DATA(string);
2147 len = PyUnicode_GET_LENGTH(string);
2148 targetlen = len;
2149 if (copy_null)
2150 targetlen++;
2151 if (!target) {
2152 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2153 PyErr_NoMemory();
2154 return NULL;
2155 }
2156 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2157 if (!target) {
2158 PyErr_NoMemory();
2159 return NULL;
2160 }
2161 }
2162 else {
2163 if (targetsize < targetlen) {
2164 PyErr_Format(PyExc_SystemError,
2165 "string is longer than the buffer");
2166 if (copy_null && 0 < targetsize)
2167 target[0] = 0;
2168 return NULL;
2169 }
2170 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002171 if (kind == PyUnicode_1BYTE_KIND) {
2172 Py_UCS1 *start = (Py_UCS1 *) data;
2173 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002175 else if (kind == PyUnicode_2BYTE_KIND) {
2176 Py_UCS2 *start = (Py_UCS2 *) data;
2177 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2178 }
2179 else {
2180 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 if (copy_null)
2184 target[len] = 0;
2185 return target;
2186}
2187
2188Py_UCS4*
2189PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2190 int copy_null)
2191{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002192 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 PyErr_BadInternalCall();
2194 return NULL;
2195 }
2196 return as_ucs4(string, target, targetsize, copy_null);
2197}
2198
2199Py_UCS4*
2200PyUnicode_AsUCS4Copy(PyObject *string)
2201{
2202 return as_ucs4(string, NULL, 0, 1);
2203}
2204
2205#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002211 if (size == 0) {
2212 Py_INCREF(unicode_empty);
2213 return unicode_empty;
2214 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002215 PyErr_BadInternalCall();
2216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 }
2218
Martin v. Löwis790465f2008-04-05 20:41:37 +00002219 if (size == -1) {
2220 size = wcslen(w);
2221 }
2222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224}
2225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002227
Walter Dörwald346737f2007-05-31 10:44:43 +00002228static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2230 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002231{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 *fmt++ = '%';
2233 if (width) {
2234 if (zeropad)
2235 *fmt++ = '0';
2236 fmt += sprintf(fmt, "%d", width);
2237 }
2238 if (precision)
2239 fmt += sprintf(fmt, ".%d", precision);
2240 if (longflag)
2241 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002242 else if (longlongflag) {
2243 /* longlongflag should only ever be nonzero on machines with
2244 HAVE_LONG_LONG defined */
2245#ifdef HAVE_LONG_LONG
2246 char *f = PY_FORMAT_LONG_LONG;
2247 while (*f)
2248 *fmt++ = *f++;
2249#else
2250 /* we shouldn't ever get here */
2251 assert(0);
2252 *fmt++ = 'l';
2253#endif
2254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 else if (size_tflag) {
2256 char *f = PY_FORMAT_SIZE_T;
2257 while (*f)
2258 *fmt++ = *f++;
2259 }
2260 *fmt++ = c;
2261 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002262}
2263
/* Helpers for PyUnicode_FromFormatV(). */

/* Return nonzero if `c` is one of the integer conversions that may follow a
   length modifier: 'd', 'u' or 'i'. */
static int
format_is_integer_code(char c)
{
    return c == 'd' || c == 'u' || c == 'i';
}

/* Parse the "width.precision" part and the length modifier of one format
   specification.

   `f` points to the '%' that starts the specification.  The parsed values
   are stored through each of the p_* pointers that is not NULL: the width
   and precision (0 when absent) and three flags telling whether an "l",
   "ll" (only with HAVE_LONG_LONG) or "z" modifier was found.  A modifier is
   only recognized when it is directly followed by 'd', 'u' or 'i'.

   Return a pointer to the conversion character.  For a specification cut
   short by the end of the string, or a precision directly followed by '%',
   the returned pointer is moved back by one character. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%', then read the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (format_is_integer_code(f[1])) {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_is_integer_code(f[2])) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (format_is_integer_code(f[1])) {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand back whatever the caller asked for */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2328
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002329/* maximum number of characters required for output of %ld. 21 characters
2330 allows for 64-bit integers (in decimal) and an optional sign. */
2331#define MAX_LONG_CHARS 21
2332/* maximum number of characters required for output of %lld.
2333 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2334 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2335#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2336
Walter Dörwaldd2034312007-05-18 16:29:38 +00002337PyObject *
2338PyUnicode_FromFormatV(const char *format, va_list vargs)
2339{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 va_list count;
2341 Py_ssize_t callcount = 0;
2342 PyObject **callresults = NULL;
2343 PyObject **callresult = NULL;
2344 Py_ssize_t n = 0;
2345 int width = 0;
2346 int precision = 0;
2347 int zeropad;
2348 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002349 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002350 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002351 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2353 Py_UCS4 argmaxchar;
2354 Py_ssize_t numbersize = 0;
2355 char *numberresults = NULL;
2356 char *numberresult = NULL;
2357 Py_ssize_t i;
2358 int kind;
2359 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002360
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002361 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002362 /* step 1: count the number of %S/%R/%A/%s format specifications
2363 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2364 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002366 * also estimate a upper bound for all the number formats in the string,
2367 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002369 for (f = format; *f; f++) {
2370 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002371 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2373 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2374 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2375 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002378#ifdef HAVE_LONG_LONG
2379 if (longlongflag) {
2380 if (width < MAX_LONG_LONG_CHARS)
2381 width = MAX_LONG_LONG_CHARS;
2382 }
2383 else
2384#endif
2385 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2386 including sign. Decimal takes the most space. This
2387 isn't enough for octal. If a width is specified we
2388 need more (which we allocate later). */
2389 if (width < MAX_LONG_CHARS)
2390 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391
2392 /* account for the size + '\0' to separate numbers
2393 inside of the numberresults buffer */
2394 numbersize += (width + 1);
2395 }
2396 }
2397 else if ((unsigned char)*f > 127) {
2398 PyErr_Format(PyExc_ValueError,
2399 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2400 "string, got a non-ASCII byte: 0x%02x",
2401 (unsigned char)*f);
2402 return NULL;
2403 }
2404 }
2405 /* step 2: allocate memory for the results of
2406 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2407 if (callcount) {
2408 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2409 if (!callresults) {
2410 PyErr_NoMemory();
2411 return NULL;
2412 }
2413 callresult = callresults;
2414 }
2415 /* step 2.5: allocate memory for the results of formating numbers */
2416 if (numbersize) {
2417 numberresults = PyObject_Malloc(numbersize);
2418 if (!numberresults) {
2419 PyErr_NoMemory();
2420 goto fail;
2421 }
2422 numberresult = numberresults;
2423 }
2424
2425 /* step 3: format numbers and figure out how large a buffer we need */
2426 for (f = format; *f; f++) {
2427 if (*f == '%') {
2428 const char* p;
2429 int longflag;
2430 int longlongflag;
2431 int size_tflag;
2432 int numprinted;
2433
2434 p = f;
2435 zeropad = (f[1] == '0');
2436 f = parse_format_flags(f, &width, &precision,
2437 &longflag, &longlongflag, &size_tflag);
2438 switch (*f) {
2439 case 'c':
2440 {
2441 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n++;
2444 break;
2445 }
2446 case '%':
2447 n++;
2448 break;
2449 case 'i':
2450 case 'd':
2451 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2452 width, precision, *f);
2453 if (longflag)
2454 numprinted = sprintf(numberresult, fmt,
2455 va_arg(count, long));
2456#ifdef HAVE_LONG_LONG
2457 else if (longlongflag)
2458 numprinted = sprintf(numberresult, fmt,
2459 va_arg(count, PY_LONG_LONG));
2460#endif
2461 else if (size_tflag)
2462 numprinted = sprintf(numberresult, fmt,
2463 va_arg(count, Py_ssize_t));
2464 else
2465 numprinted = sprintf(numberresult, fmt,
2466 va_arg(count, int));
2467 n += numprinted;
2468 /* advance by +1 to skip over the '\0' */
2469 numberresult += (numprinted + 1);
2470 assert(*(numberresult - 1) == '\0');
2471 assert(*(numberresult - 2) != '\0');
2472 assert(numprinted >= 0);
2473 assert(numberresult <= numberresults + numbersize);
2474 break;
2475 case 'u':
2476 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2477 width, precision, 'u');
2478 if (longflag)
2479 numprinted = sprintf(numberresult, fmt,
2480 va_arg(count, unsigned long));
2481#ifdef HAVE_LONG_LONG
2482 else if (longlongflag)
2483 numprinted = sprintf(numberresult, fmt,
2484 va_arg(count, unsigned PY_LONG_LONG));
2485#endif
2486 else if (size_tflag)
2487 numprinted = sprintf(numberresult, fmt,
2488 va_arg(count, size_t));
2489 else
2490 numprinted = sprintf(numberresult, fmt,
2491 va_arg(count, unsigned int));
2492 n += numprinted;
2493 numberresult += (numprinted + 1);
2494 assert(*(numberresult - 1) == '\0');
2495 assert(*(numberresult - 2) != '\0');
2496 assert(numprinted >= 0);
2497 assert(numberresult <= numberresults + numbersize);
2498 break;
2499 case 'x':
2500 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2501 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2502 n += numprinted;
2503 numberresult += (numprinted + 1);
2504 assert(*(numberresult - 1) == '\0');
2505 assert(*(numberresult - 2) != '\0');
2506 assert(numprinted >= 0);
2507 assert(numberresult <= numberresults + numbersize);
2508 break;
2509 case 'p':
2510 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2511 /* %p is ill-defined: ensure leading 0x. */
2512 if (numberresult[1] == 'X')
2513 numberresult[1] = 'x';
2514 else if (numberresult[1] != 'x') {
2515 memmove(numberresult + 2, numberresult,
2516 strlen(numberresult) + 1);
2517 numberresult[0] = '0';
2518 numberresult[1] = 'x';
2519 numprinted += 2;
2520 }
2521 n += numprinted;
2522 numberresult += (numprinted + 1);
2523 assert(*(numberresult - 1) == '\0');
2524 assert(*(numberresult - 2) != '\0');
2525 assert(numprinted >= 0);
2526 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 break;
2528 case 's':
2529 {
2530 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002531 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002532 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002533 if (!str)
2534 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 /* since PyUnicode_DecodeUTF8 returns already flexible
2536 unicode objects, there is no need to call ready on them */
2537 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002538 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002540 /* Remember the str and switch to the next slot */
2541 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 break;
2543 }
2544 case 'U':
2545 {
2546 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002547 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 if (PyUnicode_READY(obj) == -1)
2549 goto fail;
2550 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002551 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 break;
2554 }
2555 case 'V':
2556 {
2557 PyObject *obj = va_arg(count, PyObject *);
2558 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002559 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002561 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002562 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 if (PyUnicode_READY(obj) == -1)
2564 goto fail;
2565 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002566 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002568 *callresult++ = NULL;
2569 }
2570 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002571 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002572 if (!str_obj)
2573 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002574 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002575 Py_DECREF(str_obj);
2576 goto fail;
2577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002579 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002581 *callresult++ = str_obj;
2582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 }
2585 case 'S':
2586 {
2587 PyObject *obj = va_arg(count, PyObject *);
2588 PyObject *str;
2589 assert(obj);
2590 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002591 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002593 if (PyUnicode_READY(str) == -1) {
2594 Py_DECREF(str);
2595 goto fail;
2596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002598 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002600 /* Remember the str and switch to the next slot */
2601 *callresult++ = str;
2602 break;
2603 }
2604 case 'R':
2605 {
2606 PyObject *obj = va_arg(count, PyObject *);
2607 PyObject *repr;
2608 assert(obj);
2609 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002610 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002612 if (PyUnicode_READY(repr) == -1) {
2613 Py_DECREF(repr);
2614 goto fail;
2615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002617 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 /* Remember the repr and switch to the next slot */
2620 *callresult++ = repr;
2621 break;
2622 }
2623 case 'A':
2624 {
2625 PyObject *obj = va_arg(count, PyObject *);
2626 PyObject *ascii;
2627 assert(obj);
2628 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002629 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (PyUnicode_READY(ascii) == -1) {
2632 Py_DECREF(ascii);
2633 goto fail;
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002636 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 /* Remember the repr and switch to the next slot */
2639 *callresult++ = ascii;
2640 break;
2641 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 default:
2643 /* if we stumble upon an unknown
2644 formatting code, copy the rest of
2645 the format string to the output
2646 string. (we cannot just skip the
2647 code, since there's no way to know
2648 what's in the argument list) */
2649 n += strlen(p);
2650 goto expand;
2651 }
2652 } else
2653 n++;
2654 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002655 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 we don't have to resize the string.
2659 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002660 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 if (!string)
2662 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 kind = PyUnicode_KIND(string);
2664 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002670 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002671
2672 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2674 /* checking for == because the last argument could be a empty
2675 string, which causes i to point to end, the assert at the end of
2676 the loop */
2677 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002678
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 switch (*f) {
2680 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002681 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 const int ordinal = va_arg(vargs, int);
2683 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002685 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002686 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002691 {
2692 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 /* unused, since we already have the result */
2694 if (*f == 'p')
2695 (void) va_arg(vargs, void *);
2696 else
2697 (void) va_arg(vargs, int);
2698 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002699 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002701 i += written;
2702 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 assert(*numberresult == '\0');
2704 numberresult++;
2705 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 case 's':
2709 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002710 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002712 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 size = PyUnicode_GET_LENGTH(*callresult);
2714 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002715 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002717 /* We're done with the unicode()/repr() => forget it */
2718 Py_DECREF(*callresult);
2719 /* switch to next unicode()/repr() result */
2720 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 break;
2722 }
2723 case 'U':
2724 {
2725 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 Py_ssize_t size;
2727 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2728 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002729 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 break;
2732 }
2733 case 'V':
2734 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002737 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002738 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 size = PyUnicode_GET_LENGTH(obj);
2740 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002741 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 size = PyUnicode_GET_LENGTH(*callresult);
2745 assert(PyUnicode_KIND(*callresult) <=
2746 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002747 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002749 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002750 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002751 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 break;
2753 }
2754 case 'S':
2755 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002756 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002757 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002758 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002759 /* unused, since we already have the result */
2760 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002762 copy_characters(string, i, *callresult, 0, size);
2763 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002764 /* We're done with the unicode()/repr() => forget it */
2765 Py_DECREF(*callresult);
2766 /* switch to next unicode()/repr() result */
2767 ++callresult;
2768 break;
2769 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002771 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 break;
2773 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002774 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002776 goto end;
2777 }
Victor Stinner1205f272010-09-11 00:54:47 +00002778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 else {
2780 assert(i < PyUnicode_GET_LENGTH(string));
2781 PyUnicode_WRITE(kind, data, i++, *f);
2782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 if (callresults)
2788 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 if (numberresults)
2790 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002791 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 if (callresults) {
2794 PyObject **callresult2 = callresults;
2795 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002796 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 ++callresult2;
2798 }
2799 PyObject_Free(callresults);
2800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 if (numberresults)
2802 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002804}
2805
Walter Dörwaldd2034312007-05-18 16:29:38 +00002806PyObject *
2807PyUnicode_FromFormat(const char *format, ...)
2808{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 PyObject* ret;
2810 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002811
2812#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002814#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002816#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 ret = PyUnicode_FromFormatV(format, vargs);
2818 va_end(vargs);
2819 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002820}
2821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822#ifdef HAVE_WCHAR_H
2823
Victor Stinner5593d8a2010-10-02 11:11:27 +00002824/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2825 convert a Unicode object to a wide character string.
2826
Victor Stinnerd88d9832011-09-06 02:00:05 +02002827 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 character) required to convert the unicode object. Ignore size argument.
2829
Victor Stinnerd88d9832011-09-06 02:00:05 +02002830 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002831 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002832 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002833static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002834unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002835 wchar_t *w,
2836 Py_ssize_t size)
2837{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002838 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 const wchar_t *wstr;
2840
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002841 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842 if (wstr == NULL)
2843 return -1;
2844
Victor Stinner5593d8a2010-10-02 11:11:27 +00002845 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002846 if (size > res)
2847 size = res + 1;
2848 else
2849 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002851 return res;
2852 }
2853 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002855}
2856
2857Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002858PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002859 wchar_t *w,
2860 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861{
2862 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 PyErr_BadInternalCall();
2864 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002866 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867}
2868
Victor Stinner137c34c2010-09-29 10:25:54 +00002869wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002870PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002871 Py_ssize_t *size)
2872{
2873 wchar_t* buffer;
2874 Py_ssize_t buflen;
2875
2876 if (unicode == NULL) {
2877 PyErr_BadInternalCall();
2878 return NULL;
2879 }
2880
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002881 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 if (buflen == -1)
2883 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002884 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002885 PyErr_NoMemory();
2886 return NULL;
2887 }
2888
Victor Stinner137c34c2010-09-29 10:25:54 +00002889 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2890 if (buffer == NULL) {
2891 PyErr_NoMemory();
2892 return NULL;
2893 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002894 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002895 if (buflen == -1)
2896 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002897 if (size != NULL)
2898 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002899 return buffer;
2900}
2901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903
Alexander Belopolsky40018472011-02-26 01:02:56 +00002904PyObject *
2905PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002908 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 PyErr_SetString(PyExc_ValueError,
2910 "chr() arg not in range(0x110000)");
2911 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002912 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002914 if (ordinal < 256)
2915 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 v = PyUnicode_New(1, ordinal);
2918 if (v == NULL)
2919 return NULL;
2920 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002921 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
2926PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002928 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002930 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002931 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002932 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 Py_INCREF(obj);
2934 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002935 }
2936 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 /* For a Unicode subtype that's not a Unicode object,
2938 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002939 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002940 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002941 PyErr_Format(PyExc_TypeError,
2942 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002943 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002944 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002945}
2946
Alexander Belopolsky40018472011-02-26 01:02:56 +00002947PyObject *
2948PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002949 const char *encoding,
2950 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002951{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002952 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002953 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002954
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 PyErr_BadInternalCall();
2957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002959
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002960 /* Decoding bytes objects is the most common case and should be fast */
2961 if (PyBytes_Check(obj)) {
2962 if (PyBytes_GET_SIZE(obj) == 0) {
2963 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002964 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002965 }
2966 else {
2967 v = PyUnicode_Decode(
2968 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2969 encoding, errors);
2970 }
2971 return v;
2972 }
2973
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002974 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 PyErr_SetString(PyExc_TypeError,
2976 "decoding str is not supported");
2977 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002978 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002979
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002980 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2981 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2982 PyErr_Format(PyExc_TypeError,
2983 "coercing to str: need bytes, bytearray "
2984 "or buffer-like object, %.80s found",
2985 Py_TYPE(obj)->tp_name);
2986 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002987 }
Tim Petersced69f82003-09-16 20:30:58 +00002988
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002989 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002990 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002991 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 }
Tim Petersced69f82003-09-16 20:30:58 +00002993 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002994 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002995
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002996 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002997 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998}
2999
/* Normalize an encoding name into `lower`: upper case ASCII letters are
   converted to lower case and '_' is replaced with '-', in order to catch
   e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` is a buffer of `lower_len` bytes; the result is always
   null-terminated on success.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the null byte nothing can be stored, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte of the literal */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last slot of the buffer: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3036
Alexander Belopolsky40018472011-02-26 01:02:56 +00003037PyObject *
3038PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003039 Py_ssize_t size,
3040 const char *encoding,
3041 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003042{
3043 PyObject *buffer = NULL, *unicode;
3044 Py_buffer info;
3045 char lower[11]; /* Enough for any encoding shortcut */
3046
Fred Drakee4315f52000-05-09 19:53:39 +00003047 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003048 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003049 if ((strcmp(lower, "utf-8") == 0) ||
3050 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003051 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003052 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003053 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003054 (strcmp(lower, "iso-8859-1") == 0))
3055 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003056#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003057 else if (strcmp(lower, "mbcs") == 0)
3058 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003059#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003060 else if (strcmp(lower, "ascii") == 0)
3061 return PyUnicode_DecodeASCII(s, size, errors);
3062 else if (strcmp(lower, "utf-16") == 0)
3063 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3064 else if (strcmp(lower, "utf-32") == 0)
3065 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067
3068 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003069 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003070 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003071 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003072 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (buffer == NULL)
3074 goto onError;
3075 unicode = PyCodec_Decode(buffer, encoding, errors);
3076 if (unicode == NULL)
3077 goto onError;
3078 if (!PyUnicode_Check(unicode)) {
3079 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003080 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003081 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 Py_DECREF(unicode);
3083 goto onError;
3084 }
3085 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003086 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 Py_XDECREF(buffer);
3090 return NULL;
3091}
3092
Alexander Belopolsky40018472011-02-26 01:02:56 +00003093PyObject *
3094PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003095 const char *encoding,
3096 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003097{
3098 PyObject *v;
3099
3100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
3102 goto onError;
3103 }
3104
3105 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003107
3108 /* Decode via the codec registry */
3109 v = PyCodec_Decode(unicode, encoding, errors);
3110 if (v == NULL)
3111 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003112 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003113
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003115 return NULL;
3116}
3117
Alexander Belopolsky40018472011-02-26 01:02:56 +00003118PyObject *
3119PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003120 const char *encoding,
3121 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003122{
3123 PyObject *v;
3124
3125 if (!PyUnicode_Check(unicode)) {
3126 PyErr_BadArgument();
3127 goto onError;
3128 }
3129
3130 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003132
3133 /* Decode via the codec registry */
3134 v = PyCodec_Decode(unicode, encoding, errors);
3135 if (v == NULL)
3136 goto onError;
3137 if (!PyUnicode_Check(v)) {
3138 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003139 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003140 Py_TYPE(v)->tp_name);
3141 Py_DECREF(v);
3142 goto onError;
3143 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003144 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003145
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003147 return NULL;
3148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150PyObject *
3151PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003152 Py_ssize_t size,
3153 const char *encoding,
3154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155{
3156 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003157
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 unicode = PyUnicode_FromUnicode(s, size);
3159 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3162 Py_DECREF(unicode);
3163 return v;
3164}
3165
Alexander Belopolsky40018472011-02-26 01:02:56 +00003166PyObject *
3167PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003168 const char *encoding,
3169 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003170{
3171 PyObject *v;
3172
3173 if (!PyUnicode_Check(unicode)) {
3174 PyErr_BadArgument();
3175 goto onError;
3176 }
3177
3178 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003180
3181 /* Encode via the codec registry */
3182 v = PyCodec_Encode(unicode, encoding, errors);
3183 if (v == NULL)
3184 goto onError;
3185 return v;
3186
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003188 return NULL;
3189}
3190
/* Locate the first wide character of the null-terminated string `wstr`
   that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Return the index, in wchar_t
   units, of the first character for which wcstombs() fails, or 0 if every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor, *char_start;
    char encoded[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif

    /* The last slot always holds the terminator of the scratch string */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    cursor = wstr;
    while (*cursor != L'\0') {
        char_start = cursor;
        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        /* Keep a surrogate pair together: it encodes a single character */
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
            unit[1] = *cursor++;
        else
            unit[1] = 0;
#endif
        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3237
Victor Stinner1b579672011-12-17 05:47:23 +01003238static int
3239locale_error_handler(const char *errors, int *surrogateescape)
3240{
3241 if (errors == NULL) {
3242 *surrogateescape = 0;
3243 return 0;
3244 }
3245
3246 if (strcmp(errors, "strict") == 0) {
3247 *surrogateescape = 0;
3248 return 0;
3249 }
3250 if (strcmp(errors, "surrogateescape") == 0) {
3251 *surrogateescape = 1;
3252 return 0;
3253 }
3254 PyErr_Format(PyExc_ValueError,
3255 "only 'strict' and 'surrogateescape' error handlers "
3256 "are supported, not '%s'",
3257 errors);
3258 return -1;
3259}
3260
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003261PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003262PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263{
3264 Py_ssize_t wlen, wlen2;
3265 wchar_t *wstr;
3266 PyObject *bytes = NULL;
3267 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003268 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003269 PyObject *exc;
3270 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003271 int surrogateescape;
3272
3273 if (locale_error_handler(errors, &surrogateescape) < 0)
3274 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003275
3276 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3277 if (wstr == NULL)
3278 return NULL;
3279
3280 wlen2 = wcslen(wstr);
3281 if (wlen2 != wlen) {
3282 PyMem_Free(wstr);
3283 PyErr_SetString(PyExc_TypeError, "embedded null character");
3284 return NULL;
3285 }
3286
3287 if (surrogateescape) {
3288 /* locale encoding with surrogateescape */
3289 char *str;
3290
3291 str = _Py_wchar2char(wstr, &error_pos);
3292 if (str == NULL) {
3293 if (error_pos == (size_t)-1) {
3294 PyErr_NoMemory();
3295 PyMem_Free(wstr);
3296 return NULL;
3297 }
3298 else {
3299 goto encode_error;
3300 }
3301 }
3302 PyMem_Free(wstr);
3303
3304 bytes = PyBytes_FromString(str);
3305 PyMem_Free(str);
3306 }
3307 else {
3308 size_t len, len2;
3309
3310 len = wcstombs(NULL, wstr, 0);
3311 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003312 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003313 goto encode_error;
3314 }
3315
3316 bytes = PyBytes_FromStringAndSize(NULL, len);
3317 if (bytes == NULL) {
3318 PyMem_Free(wstr);
3319 return NULL;
3320 }
3321
3322 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3323 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003324 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003325 goto encode_error;
3326 }
3327 PyMem_Free(wstr);
3328 }
3329 return bytes;
3330
3331encode_error:
3332 errmsg = strerror(errno);
3333 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003334
3335 if (error_pos == (size_t)-1)
3336 error_pos = wcstombs_errorpos(wstr);
3337
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003338 PyMem_Free(wstr);
3339 Py_XDECREF(bytes);
3340
Victor Stinner2f197072011-12-17 07:08:30 +01003341 if (errmsg != NULL) {
3342 size_t errlen;
3343 wstr = _Py_char2wchar(errmsg, &errlen);
3344 if (wstr != NULL) {
3345 reason = PyUnicode_FromWideChar(wstr, errlen);
3346 PyMem_Free(wstr);
3347 } else
3348 errmsg = NULL;
3349 }
3350 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003351 reason = PyUnicode_FromString(
3352 "wcstombs() encountered an unencodable "
3353 "wide character");
3354 if (reason == NULL)
3355 return NULL;
3356
3357 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3358 "locale", unicode,
3359 (Py_ssize_t)error_pos,
3360 (Py_ssize_t)(error_pos+1),
3361 reason);
3362 Py_DECREF(reason);
3363 if (exc != NULL) {
3364 PyCodec_StrictErrors(exc);
3365 Py_XDECREF(exc);
3366 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003367 return NULL;
3368}
3369
Victor Stinnerad158722010-10-27 00:25:46 +00003370PyObject *
3371PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003372{
Victor Stinner99b95382011-07-04 14:23:54 +02003373#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003374 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003375#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003377#else
Victor Stinner793b5312011-04-27 00:24:21 +02003378 PyInterpreterState *interp = PyThreadState_GET()->interp;
3379 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3380 cannot use it to encode and decode filenames before it is loaded. Load
3381 the Python codec requires to encode at least its own filename. Use the C
3382 version of the locale codec until the codec registry is initialized and
3383 the Python codec is loaded.
3384
3385 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3386 cannot only rely on it: check also interp->fscodec_initialized for
3387 subinterpreters. */
3388 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003389 return PyUnicode_AsEncodedString(unicode,
3390 Py_FileSystemDefaultEncoding,
3391 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003392 }
3393 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003394 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003395 }
Victor Stinnerad158722010-10-27 00:25:46 +00003396#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003397}
3398
Alexander Belopolsky40018472011-02-26 01:02:56 +00003399PyObject *
3400PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003401 const char *encoding,
3402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403{
3404 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003405 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003406
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 if (!PyUnicode_Check(unicode)) {
3408 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 }
Fred Drakee4315f52000-05-09 19:53:39 +00003411
Fred Drakee4315f52000-05-09 19:53:39 +00003412 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003413 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003414 if ((strcmp(lower, "utf-8") == 0) ||
3415 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003416 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003417 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003418 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003419 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003420 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003421 }
Victor Stinner37296e82010-06-10 13:36:23 +00003422 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003423 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003424 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003425 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003426#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003427 else if (strcmp(lower, "mbcs") == 0)
3428 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003429#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003430 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003431 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433
3434 /* Encode via the codec registry */
3435 v = PyCodec_Encode(unicode, encoding, errors);
3436 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003437 return NULL;
3438
3439 /* The normal path */
3440 if (PyBytes_Check(v))
3441 return v;
3442
3443 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003444 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003445 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003446 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003447
3448 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3449 "encoder %s returned bytearray instead of bytes",
3450 encoding);
3451 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003452 Py_DECREF(v);
3453 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003454 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003455
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003456 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3457 Py_DECREF(v);
3458 return b;
3459 }
3460
3461 PyErr_Format(PyExc_TypeError,
3462 "encoder did not return a bytes object (type=%.400s)",
3463 Py_TYPE(v)->tp_name);
3464 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003465 return NULL;
3466}
3467
Alexander Belopolsky40018472011-02-26 01:02:56 +00003468PyObject *
3469PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003470 const char *encoding,
3471 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472{
3473 PyObject *v;
3474
3475 if (!PyUnicode_Check(unicode)) {
3476 PyErr_BadArgument();
3477 goto onError;
3478 }
3479
3480 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003482
3483 /* Encode via the codec registry */
3484 v = PyCodec_Encode(unicode, encoding, errors);
3485 if (v == NULL)
3486 goto onError;
3487 if (!PyUnicode_Check(v)) {
3488 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003489 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003490 Py_TYPE(v)->tp_name);
3491 Py_DECREF(v);
3492 goto onError;
3493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003495
Benjamin Peterson29060642009-01-31 22:14:21 +00003496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 return NULL;
3498}
3499
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert.

   Return the byte offset of the first invalid or incomplete multibyte
   sequence.  Return 0 if no such sequence is found, or if mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    /* Start from the initial conversion state */
    memset(&state, 0, sizeof state);
    for (; remaining != 0; cursor += consumed, remaining -= consumed) {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3530
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003531PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003532PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003533 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003534{
3535 wchar_t smallbuf[256];
3536 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3537 wchar_t *wstr;
3538 size_t wlen, wlen2;
3539 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003540 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003541 size_t error_pos;
3542 char *errmsg;
3543 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003544
3545 if (locale_error_handler(errors, &surrogateescape) < 0)
3546 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547
3548 if (str[len] != '\0' || len != strlen(str)) {
3549 PyErr_SetString(PyExc_TypeError, "embedded null character");
3550 return NULL;
3551 }
3552
3553 if (surrogateescape)
3554 {
3555 wstr = _Py_char2wchar(str, &wlen);
3556 if (wstr == NULL) {
3557 if (wlen == (size_t)-1)
3558 PyErr_NoMemory();
3559 else
3560 PyErr_SetFromErrno(PyExc_OSError);
3561 return NULL;
3562 }
3563
3564 unicode = PyUnicode_FromWideChar(wstr, wlen);
3565 PyMem_Free(wstr);
3566 }
3567 else {
3568#ifndef HAVE_BROKEN_MBSTOWCS
3569 wlen = mbstowcs(NULL, str, 0);
3570#else
3571 wlen = len;
3572#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003573 if (wlen == (size_t)-1)
3574 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575 if (wlen+1 <= smallbuf_len) {
3576 wstr = smallbuf;
3577 }
3578 else {
3579 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3580 return PyErr_NoMemory();
3581
3582 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3583 if (!wstr)
3584 return PyErr_NoMemory();
3585 }
3586
3587 /* This shouldn't fail now */
3588 wlen2 = mbstowcs(wstr, str, wlen+1);
3589 if (wlen2 == (size_t)-1) {
3590 if (wstr != smallbuf)
3591 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003592 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003593 }
3594#ifdef HAVE_BROKEN_MBSTOWCS
3595 assert(wlen2 == wlen);
3596#endif
3597 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3598 if (wstr != smallbuf)
3599 PyMem_Free(wstr);
3600 }
3601 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003602
3603decode_error:
3604 errmsg = strerror(errno);
3605 assert(errmsg != NULL);
3606
3607 error_pos = mbstowcs_errorpos(str, len);
3608 if (errmsg != NULL) {
3609 size_t errlen;
3610 wstr = _Py_char2wchar(errmsg, &errlen);
3611 if (wstr != NULL) {
3612 reason = PyUnicode_FromWideChar(wstr, errlen);
3613 PyMem_Free(wstr);
3614 } else
3615 errmsg = NULL;
3616 }
3617 if (errmsg == NULL)
3618 reason = PyUnicode_FromString(
3619 "mbstowcs() encountered an invalid multibyte sequence");
3620 if (reason == NULL)
3621 return NULL;
3622
3623 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3624 "locale", str, len,
3625 (Py_ssize_t)error_pos,
3626 (Py_ssize_t)(error_pos+1),
3627 reason);
3628 Py_DECREF(reason);
3629 if (exc != NULL) {
3630 PyCodec_StrictErrors(exc);
3631 Py_XDECREF(exc);
3632 }
3633 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634}
3635
3636PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003637PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003638{
3639 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003640 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641}
3642
3643
3644PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003645PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003646 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003647 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3648}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649
Christian Heimes5894ba72007-11-04 11:43:14 +00003650PyObject*
3651PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3652{
Victor Stinner99b95382011-07-04 14:23:54 +02003653#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003654 return PyUnicode_DecodeMBCS(s, size, NULL);
3655#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003656 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003657#else
Victor Stinner793b5312011-04-27 00:24:21 +02003658 PyInterpreterState *interp = PyThreadState_GET()->interp;
3659 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3660 cannot use it to encode and decode filenames before it is loaded. Load
3661 the Python codec requires to encode at least its own filename. Use the C
3662 version of the locale codec until the codec registry is initialized and
3663 the Python codec is loaded.
3664
3665 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3666 cannot only rely on it: check also interp->fscodec_initialized for
3667 subinterpreters. */
3668 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003669 return PyUnicode_Decode(s, size,
3670 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003671 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003672 }
3673 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003674 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003675 }
Victor Stinnerad158722010-10-27 00:25:46 +00003676#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003677}
3678
Martin v. Löwis011e8422009-05-05 04:43:17 +00003679
3680int
Antoine Pitrou13348842012-01-29 18:36:34 +01003681_PyUnicode_HasNULChars(PyObject* s)
3682{
3683 static PyObject *nul = NULL;
3684
3685 if (nul == NULL)
3686 nul = PyUnicode_FromStringAndSize("\0", 1);
3687 if (nul == NULL)
3688 return -1;
3689 return PyUnicode_Contains(s, nul);
3690}
3691
3692
3693int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003694PyUnicode_FSConverter(PyObject* arg, void* addr)
3695{
3696 PyObject *output = NULL;
3697 Py_ssize_t size;
3698 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003699 if (arg == NULL) {
3700 Py_DECREF(*(PyObject**)addr);
3701 return 1;
3702 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003703 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704 output = arg;
3705 Py_INCREF(output);
3706 }
3707 else {
3708 arg = PyUnicode_FromObject(arg);
3709 if (!arg)
3710 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003711 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003712 Py_DECREF(arg);
3713 if (!output)
3714 return 0;
3715 if (!PyBytes_Check(output)) {
3716 Py_DECREF(output);
3717 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3718 return 0;
3719 }
3720 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003721 size = PyBytes_GET_SIZE(output);
3722 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003723 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003724 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003725 Py_DECREF(output);
3726 return 0;
3727 }
3728 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003729 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003730}
3731
3732
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003733int
3734PyUnicode_FSDecoder(PyObject* arg, void* addr)
3735{
3736 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003737 if (arg == NULL) {
3738 Py_DECREF(*(PyObject**)addr);
3739 return 1;
3740 }
3741 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003742 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003744 output = arg;
3745 Py_INCREF(output);
3746 }
3747 else {
3748 arg = PyBytes_FromObject(arg);
3749 if (!arg)
3750 return 0;
3751 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3752 PyBytes_GET_SIZE(arg));
3753 Py_DECREF(arg);
3754 if (!output)
3755 return 0;
3756 if (!PyUnicode_Check(output)) {
3757 Py_DECREF(output);
3758 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3759 return 0;
3760 }
3761 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003762 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003763 Py_DECREF(output);
3764 return 0;
3765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003767 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003768 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3769 Py_DECREF(output);
3770 return 0;
3771 }
3772 *(PyObject**)addr = output;
3773 return Py_CLEANUP_SUPPORTED;
3774}
3775
3776
Martin v. Löwis5b222132007-06-10 09:51:05 +00003777char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003779{
Christian Heimesf3863112007-11-22 07:46:41 +00003780 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003782 if (!PyUnicode_Check(unicode)) {
3783 PyErr_BadArgument();
3784 return NULL;
3785 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003786 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003787 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003789 if (PyUnicode_UTF8(unicode) == NULL) {
3790 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3792 if (bytes == NULL)
3793 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003794 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3795 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 Py_DECREF(bytes);
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3800 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3801 PyBytes_AS_STRING(bytes),
3802 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 Py_DECREF(bytes);
3804 }
3805
3806 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003807 *psize = PyUnicode_UTF8_LENGTH(unicode);
3808 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003809}
3810
3811char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3815}
3816
3817#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003818static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819#endif
3820
3821
3822Py_UNICODE *
3823PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 const unsigned char *one_byte;
3826#if SIZEOF_WCHAR_T == 4
3827 const Py_UCS2 *two_bytes;
3828#else
3829 const Py_UCS4 *four_bytes;
3830 const Py_UCS4 *ucs4_end;
3831 Py_ssize_t num_surrogates;
3832#endif
3833 wchar_t *w;
3834 wchar_t *wchar_end;
3835
3836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 return NULL;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 assert(_PyUnicode_KIND(unicode) != 0);
3843 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844
3845#ifdef Py_DEBUG
3846 ++unicode_as_unicode_calls;
3847#endif
3848
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003849 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3852 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 num_surrogates = 0;
3854
3855 for (; four_bytes < ucs4_end; ++four_bytes) {
3856 if (*four_bytes > 0xFFFF)
3857 ++num_surrogates;
3858 }
3859
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3861 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3870 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3872 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003873 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003875 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3876 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 }
3878 else
3879 *w = *four_bytes;
3880
3881 if (w > wchar_end) {
3882 assert(0 && "Miscalculated string end");
3883 }
3884 }
3885 *w = 0;
3886#else
3887 /* sizeof(wchar_t) == 4 */
3888 Py_FatalError("Impossible unicode object state, wstr and str "
3889 "should share memory already.");
3890 return NULL;
3891#endif
3892 }
3893 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3895 (_PyUnicode_LENGTH(unicode) + 1));
3896 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 PyErr_NoMemory();
3898 return NULL;
3899 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3901 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3902 w = _PyUnicode_WSTR(unicode);
3903 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003905 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3906 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 for (; w < wchar_end; ++one_byte, ++w)
3908 *w = *one_byte;
3909 /* null-terminate the wstr */
3910 *w = 0;
3911 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003912 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003914 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 for (; w < wchar_end; ++two_bytes, ++w)
3916 *w = *two_bytes;
3917 /* null-terminate the wstr */
3918 *w = 0;
3919#else
3920 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 PyObject_FREE(_PyUnicode_WSTR(unicode));
3922 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 Py_FatalError("Impossible unicode object state, wstr "
3924 "and str should share memory already.");
3925 return NULL;
3926#endif
3927 }
3928 else {
3929 assert(0 && "This should never happen.");
3930 }
3931 }
3932 }
3933 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 *size = PyUnicode_WSTR_LENGTH(unicode);
3935 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003936}
3937
Alexander Belopolsky40018472011-02-26 01:02:56 +00003938Py_UNICODE *
3939PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942}
3943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944
Alexander Belopolsky40018472011-02-26 01:02:56 +00003945Py_ssize_t
3946PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947{
3948 if (!PyUnicode_Check(unicode)) {
3949 PyErr_BadArgument();
3950 goto onError;
3951 }
3952 return PyUnicode_GET_SIZE(unicode);
3953
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 return -1;
3956}
3957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958Py_ssize_t
3959PyUnicode_GetLength(PyObject *unicode)
3960{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003961 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 PyErr_BadArgument();
3963 return -1;
3964 }
3965
3966 return PyUnicode_GET_LENGTH(unicode);
3967}
3968
3969Py_UCS4
3970PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3971{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003972 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3973 PyErr_BadArgument();
3974 return (Py_UCS4)-1;
3975 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003976 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003977 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 return (Py_UCS4)-1;
3979 }
3980 return PyUnicode_READ_CHAR(unicode, index);
3981}
3982
3983int
3984PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3985{
3986 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003987 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 return -1;
3989 }
Victor Stinner488fa492011-12-12 00:01:39 +01003990 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003991 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003992 PyErr_SetString(PyExc_IndexError, "string index out of range");
3993 return -1;
3994 }
Victor Stinner488fa492011-12-12 00:01:39 +01003995 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003996 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003997 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3998 PyErr_SetString(PyExc_ValueError, "character out of range");
3999 return -1;
4000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4002 index, ch);
4003 return 0;
4004}
4005
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4011
Victor Stinner554f3f02010-06-16 23:33:54 +00004012/* create or adjust a UnicodeDecodeError */
4013static void
4014make_decode_exception(PyObject **exceptionObject,
4015 const char *encoding,
4016 const char *input, Py_ssize_t length,
4017 Py_ssize_t startpos, Py_ssize_t endpos,
4018 const char *reason)
4019{
4020 if (*exceptionObject == NULL) {
4021 *exceptionObject = PyUnicodeDecodeError_Create(
4022 encoding, input, length, startpos, endpos, reason);
4023 }
4024 else {
4025 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4026 goto onError;
4027 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4028 goto onError;
4029 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4030 goto onError;
4031 }
4032 return;
4033
4034onError:
4035 Py_DECREF(*exceptionObject);
4036 *exceptionObject = NULL;
4037}
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039/* error handling callback helper:
4040 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004041 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 and adjust various state variables.
4043 return 0 on success, -1 on error
4044*/
4045
Alexander Belopolsky40018472011-02-26 01:02:56 +00004046static int
4047unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004048 const char *encoding, const char *reason,
4049 const char **input, const char **inend, Py_ssize_t *startinpos,
4050 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004051 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004053 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054
4055 PyObject *restuple = NULL;
4056 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004057 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004058 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t requiredsize;
4060 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004061 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 int res = -1;
4063
Victor Stinner596a6c42011-11-09 00:02:18 +01004064 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4065 outsize = PyUnicode_GET_LENGTH(*output);
4066 else
4067 outsize = _PyUnicode_WSTR_LENGTH(*output);
4068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 *errorHandler = PyCodec_LookupError(errors);
4071 if (*errorHandler == NULL)
4072 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 }
4074
Victor Stinner554f3f02010-06-16 23:33:54 +00004075 make_decode_exception(exceptionObject,
4076 encoding,
4077 *input, *inend - *input,
4078 *startinpos, *endinpos,
4079 reason);
4080 if (*exceptionObject == NULL)
4081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082
4083 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4084 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004087 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 }
4090 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004092 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004093 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004094
4095 /* Copy back the bytes variables, which might have been modified by the
4096 callback */
4097 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4098 if (!inputobj)
4099 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004100 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004102 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 *input = PyBytes_AS_STRING(inputobj);
4104 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004106 /* we can DECREF safely, as the exception has another reference,
4107 so the object won't go away. */
4108 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004112 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4114 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004115 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116
Victor Stinner596a6c42011-11-09 00:02:18 +01004117 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4118 /* need more space? (at least enough for what we
4119 have+the replacement+the rest of the string (starting
4120 at the new input position), so we won't have to check space
4121 when there are no errors in the rest of the string) */
4122 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4123 requiredsize = *outpos + replen + insize-newpos;
4124 if (requiredsize > outsize) {
4125 if (requiredsize<2*outsize)
4126 requiredsize = 2*outsize;
4127 if (unicode_resize(output, requiredsize) < 0)
4128 goto onError;
4129 }
4130 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004132 copy_characters(*output, *outpos, repunicode, 0, replen);
4133 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004135 else {
4136 wchar_t *repwstr;
4137 Py_ssize_t repwlen;
4138 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4139 if (repwstr == NULL)
4140 goto onError;
4141 /* need more space? (at least enough for what we
4142 have+the replacement+the rest of the string (starting
4143 at the new input position), so we won't have to check space
4144 when there are no errors in the rest of the string) */
4145 requiredsize = *outpos + repwlen + insize-newpos;
4146 if (requiredsize > outsize) {
4147 if (requiredsize < 2*outsize)
4148 requiredsize = 2*outsize;
4149 if (unicode_resize(output, requiredsize) < 0)
4150 goto onError;
4151 }
4152 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4153 *outpos += repwlen;
4154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004156 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 /* we made it! */
4159 res = 0;
4160
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 Py_XDECREF(restuple);
4163 return res;
4164}
4165
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004166/* --- UTF-7 Codec -------------------------------------------------------- */
4167
Antoine Pitrou244651a2009-05-04 18:56:13 +00004168/* See RFC2152 for details. We encode conservatively and decode liberally. */
4169
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* True if c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* The 6-bit value (0-63) of base-64 character c.  The result is only
   meaningful if IS_BASE64(c) holds: anything unrecognized maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* The base-64 character encoding the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, met in a UTF-7 string outside a
 * base-64 section, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4200
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, indexed by ASCII code (0-127).
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [  bsl  ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be free of side effects (it is evaluated several times).
 * The range test comes first so the table is never indexed with a
 * value outside 1..127.  directO and directWS are parenthesized so
 * that compound expressions passed as flags keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246
Alexander Belopolsky40018472011-02-26 01:02:56 +00004247PyObject *
4248PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004249 Py_ssize_t size,
4250 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004251{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004252 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4253}
4254
Antoine Pitrou244651a2009-05-04 18:56:13 +00004255/* The decoder. The only state we preserve is our read position,
4256 * i.e. how many characters we have consumed. So if we end in the
4257 * middle of a shift sequence we have to back off the read position
4258 * and the output to the beginning of the sequence, otherwise we lose
4259 * all the shift state (seen bits, number of bits seen, high
4260 * surrogate). */
4261
Alexander Belopolsky40018472011-02-26 01:02:56 +00004262PyObject *
4263PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004264 Py_ssize_t size,
4265 const char *errors,
4266 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004269 Py_ssize_t startinpos;
4270 Py_ssize_t endinpos;
4271 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004273 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004274 const char *errmsg = "";
4275 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004276 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004277 unsigned int base64bits = 0;
4278 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004279 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 PyObject *errorHandler = NULL;
4281 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004283 /* Start off assuming it's all ASCII. Widen later as necessary. */
4284 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285 if (!unicode)
4286 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 if (size == 0) {
4288 if (consumed)
4289 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004290 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004293 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294 e = s + size;
4295
4296 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004297 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004299 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300
Antoine Pitrou244651a2009-05-04 18:56:13 +00004301 if (inShift) { /* in a base-64 section */
4302 if (IS_BASE64(ch)) { /* consume a base-64 character */
4303 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4304 base64bits += 6;
4305 s++;
4306 if (base64bits >= 16) {
4307 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004308 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 base64bits -= 16;
4310 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4311 if (surrogate) {
4312 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004313 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4314 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004315 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4316 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004318 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
4320 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004321 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4322 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004323 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 }
4325 }
Victor Stinner551ac952011-11-29 22:58:13 +01004326 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 /* first surrogate */
4328 surrogate = outCh;
4329 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4332 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333 }
4334 }
4335 }
4336 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 inShift = 0;
4338 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004340 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4341 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004342 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344 if (base64bits > 0) { /* left-over bits */
4345 if (base64bits >= 6) {
4346 /* We've seen at least one base-64 character */
4347 errmsg = "partial character in shift sequence";
4348 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 else {
4351 /* Some bits remain; they should be zero */
4352 if (base64buffer != 0) {
4353 errmsg = "non-zero padding bits in shift sequence";
4354 goto utf7Error;
4355 }
4356 }
4357 }
4358 if (ch != '-') {
4359 /* '-' is absorbed; other terminating
4360 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004361 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4362 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004364 }
4365 }
4366 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 s++; /* consume '+' */
4369 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 }
4374 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004376 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
4379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004381 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4382 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 s++;
4384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 startinpos = s-starts;
4387 s++;
4388 errmsg = "unexpected special character";
4389 goto utf7Error;
4390 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 endinpos = s-starts;
4394 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 errors, &errorHandler,
4396 "utf7", errmsg,
4397 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004398 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
4401
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 /* end of string */
4403
4404 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4405 /* if we're in an inconsistent state, that's an error */
4406 if (surrogate ||
4407 (base64bits >= 6) ||
4408 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 endinpos = size;
4410 if (unicode_decode_call_errorhandler(
4411 errors, &errorHandler,
4412 "utf7", "unterminated shift sequence",
4413 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 goto onError;
4416 if (s < e)
4417 goto restart;
4418 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420
4421 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004422 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004425 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 }
4427 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004428 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004430 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 goto onError;
4434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_XDECREF(errorHandler);
4436 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004437 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 Py_XDECREF(errorHandler);
4441 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 Py_DECREF(unicode);
4443 return NULL;
4444}
4445
4446
Alexander Belopolsky40018472011-02-26 01:02:56 +00004447PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004448_PyUnicode_EncodeUTF7(PyObject *str,
4449 int base64SetO,
4450 int base64WhiteSpace,
4451 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004453 int kind;
4454 void *data;
4455 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004456 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004457 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004459 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 unsigned int base64bits = 0;
4461 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 char * out;
4463 char * start;
4464
Benjamin Petersonbac79492012-01-14 13:34:47 -05004465 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004466 return NULL;
4467 kind = PyUnicode_KIND(str);
4468 data = PyUnicode_DATA(str);
4469 len = PyUnicode_GET_LENGTH(str);
4470
4471 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 /* It might be possible to tighten this worst case */
4475 allocated = 8 * len;
4476 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004477 return PyErr_NoMemory();
4478
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 if (v == NULL)
4481 return NULL;
4482
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004483 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004485 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 if (inShift) {
4488 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4489 /* shifting out */
4490 if (base64bits) { /* output remaining bits */
4491 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4492 base64buffer = 0;
4493 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 }
4495 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 /* Characters not in the BASE64 set implicitly unshift the sequence
4497 so no '-' is required, except if the character is itself a '-' */
4498 if (IS_BASE64(ch) || ch == '-') {
4499 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 *out++ = (char) ch;
4502 }
4503 else {
4504 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004505 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 else { /* not in a shift sequence */
4508 if (ch == '+') {
4509 *out++ = '+';
4510 *out++ = '-';
4511 }
4512 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4513 *out++ = (char) ch;
4514 }
4515 else {
4516 *out++ = '+';
4517 inShift = 1;
4518 goto encode_char;
4519 }
4520 }
4521 continue;
4522encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004524 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004525
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 /* code first surrogate */
4527 base64bits += 16;
4528 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4529 while (base64bits >= 6) {
4530 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4531 base64bits -= 6;
4532 }
4533 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004534 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 base64bits += 16;
4537 base64buffer = (base64buffer << 16) | ch;
4538 while (base64bits >= 6) {
4539 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4540 base64bits -= 6;
4541 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (base64bits)
4544 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4545 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004547 if (_PyBytes_Resize(&v, out - start) < 0)
4548 return NULL;
4549 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004551PyObject *
4552PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4553 Py_ssize_t size,
4554 int base64SetO,
4555 int base64WhiteSpace,
4556 const char *errors)
4557{
4558 PyObject *result;
4559 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4560 if (tmp == NULL)
4561 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004562 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004563 base64WhiteSpace, errors);
4564 Py_DECREF(tmp);
4565 return result;
4566}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568#undef IS_BASE64
4569#undef FROM_BASE64
4570#undef TO_BASE64
4571#undef DECODE_DIRECT
4572#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574/* --- UTF-8 Codec -------------------------------------------------------- */
4575
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   starts.  Zero means illegal prefix: continuation bytes 80-BF, the
   overlong leads C0-C1, and F5-FF.  See RFC 3629 for details.
   The table is never written to, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4597
Alexander Belopolsky40018472011-02-26 01:02:56 +00004598PyObject *
4599PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004600 Py_ssize_t size,
4601 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602{
Walter Dörwald69652032004-09-07 20:24:22 +00004603 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4604}
4605
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004606#include "stringlib/ucs1lib.h"
4607#include "stringlib/codecs.h"
4608#include "stringlib/undef.h"
4609
4610#include "stringlib/ucs2lib.h"
4611#include "stringlib/codecs.h"
4612#include "stringlib/undef.h"
4613
4614#include "stringlib/ucs4lib.h"
4615#include "stringlib/codecs.h"
4616#include "stringlib/undef.h"
4617
Antoine Pitrouab868312009-01-10 15:40:25 +00004618/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4619#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4620
4621/* Mask to quickly check whether a C 'long' contains a
4622 non-ASCII, UTF8-encoded char. */
4623#if (SIZEOF_LONG == 8)
4624# define ASCII_CHAR_MASK 0x8080808080808080L
4625#elif (SIZEOF_LONG == 4)
4626# define ASCII_CHAR_MASK 0x80808080L
4627#else
4628# error C 'long' size should be either 4 or 8!
4629#endif
4630
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004631/* Scans a UTF-8 string and returns the maximum character to be expected
4632 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004634 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004635 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004636 */
4637static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004638utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004640 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004641 const unsigned char *end = p + string_size;
4642 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004643
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004644 assert(unicode_size != NULL);
4645
4646 /* By having a cascade of independent loops which fallback onto each
4647 other, we minimize the amount of work done in the average loop
4648 iteration, and we also maximize the CPU's ability to predict
4649 branches correctly (because a given condition will have always the
4650 same boolean outcome except perhaps in the last iteration of the
4651 corresponding loop).
4652 In the general case this brings us rather close to decoding
4653 performance pre-PEP 393, despite the two-pass decoding.
4654
4655 Note that the pure ASCII loop is not duplicated once a non-ASCII
4656 character has been encountered. It is actually a pessimization (by
4657 a significant factor) to use this loop on text with many non-ASCII
4658 characters, and it is important to avoid bad performance on valid
4659 utf-8 data (invalid utf-8 being a different can of worms).
4660 */
4661
4662 /* ASCII */
4663 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004664 /* Only check value if it's not a ASCII char... */
4665 if (*p < 0x80) {
4666 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4667 an explanation. */
4668 if (!((size_t) p & LONG_PTR_MASK)) {
4669 /* Help register allocation */
4670 register const unsigned char *_p = p;
4671 while (_p < aligned_end) {
4672 unsigned long value = *(unsigned long *) _p;
4673 if (value & ASCII_CHAR_MASK)
4674 break;
4675 _p += SIZEOF_LONG;
4676 char_count += SIZEOF_LONG;
4677 }
4678 p = _p;
4679 if (p == end)
4680 break;
4681 }
4682 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004683 if (*p < 0x80)
4684 ++char_count;
4685 else
4686 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004687 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004688 *unicode_size = char_count;
4689 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004690
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004691_ucs1loop:
4692 for (; p < end; ++p) {
4693 if (*p < 0xc4)
4694 char_count += ((*p & 0xc0) != 0x80);
4695 else
4696 goto _ucs2loop;
4697 }
4698 *unicode_size = char_count;
4699 return 255;
4700
4701_ucs2loop:
4702 for (; p < end; ++p) {
4703 if (*p < 0xf0)
4704 char_count += ((*p & 0xc0) != 0x80);
4705 else
4706 goto _ucs4loop;
4707 }
4708 *unicode_size = char_count;
4709 return 65535;
4710
4711_ucs4loop:
4712 for (; p < end; ++p) {
4713 char_count += ((*p & 0xc0) != 0x80);
4714 }
4715 *unicode_size = char_count;
4716 return 65537;
4717}
4718
/* Similar to PyUnicode_WRITE, but goes through unicode_putchar() and may
   first grow the string when `index` lies beyond its current length.
   Implicit parameters: the `unicode` object variable and the `onError`
   label, which is jumped to when resizing or storing fails.
   Growing over-allocates (index + index/8), so the result needs to be
   shrunk at the end. */
#define WRITE_MAYBE_FAIL(index, value)                                      \
    do {                                                                    \
        Py_ssize_t write_pos = index;                                       \
        if (write_pos > PyUnicode_GET_LENGTH(unicode)) {                    \
            if (unicode_resize(&unicode, write_pos + write_pos/8) < 0)      \
                goto onError;                                               \
        }                                                                   \
        if (unicode_putchar(&unicode, &write_pos, value) < 0)               \
            goto onError;                                                   \
    } while (0)
4732
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004733static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004734decode_utf8_errors(const char *starts,
4735 Py_ssize_t size,
4736 const char *errors,
4737 Py_ssize_t *consumed,
4738 const char *s,
4739 PyObject *unicode,
4740 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004741{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004743 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744 Py_ssize_t startinpos;
4745 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004746 const char *e = starts + size;
4747 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004748 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 PyObject *errorHandler = NULL;
4750 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004751
Antoine Pitrouab868312009-01-10 15:40:25 +00004752 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
4754 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004755 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756
4757 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004758 /* Fast path for runs of ASCII characters. Given that common UTF-8
4759 input will consist of an overwhelming majority of ASCII
4760 characters, we try to optimize for this case by checking
4761 as many characters as a C 'long' can contain.
4762 First, check if we can do an aligned read, as most CPUs have
4763 a penalty for unaligned reads.
4764 */
4765 if (!((size_t) s & LONG_PTR_MASK)) {
4766 /* Help register allocation */
4767 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004769 while (_s < aligned_end) {
4770 /* Read a whole long at a time (either 4 or 8 bytes),
4771 and do a fast unrolled copy if it only contains ASCII
4772 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 unsigned long value = *(unsigned long *) _s;
4774 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004775 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004776 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4777 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4778 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4779 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004780#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004781 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4782 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4783 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4784 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004785#endif
4786 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004787 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004788 }
4789 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004791 if (s == e)
4792 break;
4793 ch = (unsigned char)*s;
4794 }
4795 }
4796
4797 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004798 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 s++;
4800 continue;
4801 }
4802
4803 n = utf8_code_length[ch];
4804
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004805 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 if (consumed)
4807 break;
4808 else {
4809 errmsg = "unexpected end of data";
4810 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004811 endinpos = startinpos+1;
4812 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4813 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 goto utf8Error;
4815 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
4818 switch (n) {
4819
4820 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004821 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 startinpos = s-starts;
4823 endinpos = startinpos+1;
4824 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
4826 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004827 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 startinpos = s-starts;
4829 endinpos = startinpos+1;
4830 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831
4832 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004833 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004834 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004836 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 goto utf8Error;
4838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004840 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004841 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 break;
4843
4844 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004845 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4846 will result in surrogates in range d800-dfff. Surrogates are
4847 not valid UTF-8 so they are rejected.
4848 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4849 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004850 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004851 (s[2] & 0xc0) != 0x80 ||
4852 ((unsigned char)s[0] == 0xE0 &&
4853 (unsigned char)s[1] < 0xA0) ||
4854 ((unsigned char)s[0] == 0xED &&
4855 (unsigned char)s[1] > 0x9F)) {
4856 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004858 endinpos = startinpos + 1;
4859
4860 /* if s[1] first two bits are 1 and 0, then the invalid
4861 continuation byte is s[2], so increment endinpos by 1,
4862 if not, s[1] is invalid and endinpos doesn't need to
4863 be incremented. */
4864 if ((s[1] & 0xC0) == 0x80)
4865 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 goto utf8Error;
4867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004869 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004870 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004871 break;
4872
4873 case 4:
4874 if ((s[1] & 0xc0) != 0x80 ||
4875 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004876 (s[3] & 0xc0) != 0x80 ||
4877 ((unsigned char)s[0] == 0xF0 &&
4878 (unsigned char)s[1] < 0x90) ||
4879 ((unsigned char)s[0] == 0xF4 &&
4880 (unsigned char)s[1] > 0x8F)) {
4881 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004883 endinpos = startinpos + 1;
4884 if ((s[1] & 0xC0) == 0x80) {
4885 endinpos++;
4886 if ((s[2] & 0xC0) == 0x80)
4887 endinpos++;
4888 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 goto utf8Error;
4890 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004891 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004892 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004893 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004894
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004895 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 }
4898 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004900
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 if (unicode_decode_call_errorhandler(
4903 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004904 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004906 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004908 /* Update data because unicode_decode_call_errorhandler might have
4909 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 }
Walter Dörwald69652032004-09-07 20:24:22 +00004912 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915 /* Adjust length and ready string when it contained errors and
4916 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004917 if (unicode_resize(&unicode, i) < 0)
4918 goto onError;
4919 unicode_adjust_maxchar(&unicode);
4920 if (unicode == NULL)
4921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 Py_XDECREF(errorHandler);
4924 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004925 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004926 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 Py_XDECREF(errorHandler);
4930 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004931 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 return NULL;
4933}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004934#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004935
Victor Stinner785938e2011-12-11 20:09:03 +01004936PyObject *
4937PyUnicode_DecodeUTF8Stateful(const char *s,
4938 Py_ssize_t size,
4939 const char *errors,
4940 Py_ssize_t *consumed)
4941{
4942 Py_UCS4 maxchar = 0;
4943 Py_ssize_t unicode_size;
4944 int has_errors = 0;
4945 PyObject *unicode;
4946 int kind;
4947 void *data;
4948 const char *starts = s;
4949 const char *e;
4950 Py_ssize_t i;
4951
4952 if (size == 0) {
4953 if (consumed)
4954 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004955 Py_INCREF(unicode_empty);
4956 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004957 }
4958
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004959 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004960
4961 /* When the string is ASCII only, just use memcpy and return.
4962 unicode_size may be != size if there is an incomplete UTF-8
4963 sequence at the end of the ASCII block. */
4964 if (maxchar < 128 && size == unicode_size) {
4965 if (consumed)
4966 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004967 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004968 }
4969
4970 unicode = PyUnicode_New(unicode_size, maxchar);
4971 if (!unicode)
4972 return NULL;
4973 kind = PyUnicode_KIND(unicode);
4974 data = PyUnicode_DATA(unicode);
4975
4976 /* Unpack UTF-8 encoded data */
4977 i = 0;
4978 e = starts + size;
4979 switch (kind) {
4980 case PyUnicode_1BYTE_KIND:
4981 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4982 break;
4983 case PyUnicode_2BYTE_KIND:
4984 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4985 break;
4986 case PyUnicode_4BYTE_KIND:
4987 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4988 break;
4989 }
4990 if (!has_errors) {
4991 /* Ensure the unicode size calculation was correct */
4992 assert(i == unicode_size);
4993 assert(s == e);
4994 if (consumed)
4995 *consumed = size;
4996 return unicode;
4997 }
4998
4999 /* In case of errors, maxchar and size computation might be incorrect;
5000 code below refits and resizes as necessary. */
5001 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5002}
5003
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004#ifdef __APPLE__
5005
5006/* Simplified UTF-8 decoder using surrogateescape error handler,
5007 used to decode the command line arguments on Mac OS X. */
5008
5009wchar_t*
5010_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5011{
5012 int n;
5013 const char *e;
5014 wchar_t *unicode, *p;
5015
5016 /* Note: size will always be longer than the resulting Unicode
5017 character count */
5018 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
5019 PyErr_NoMemory();
5020 return NULL;
5021 }
5022 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
5023 if (!unicode)
5024 return NULL;
5025
5026 /* Unpack UTF-8 encoded data */
5027 p = unicode;
5028 e = s + size;
5029 while (s < e) {
5030 Py_UCS4 ch = (unsigned char)*s;
5031
5032 if (ch < 0x80) {
5033 *p++ = (wchar_t)ch;
5034 s++;
5035 continue;
5036 }
5037
5038 n = utf8_code_length[ch];
5039 if (s + n > e) {
5040 goto surrogateescape;
5041 }
5042
5043 switch (n) {
5044 case 0:
5045 case 1:
5046 goto surrogateescape;
5047
5048 case 2:
5049 if ((s[1] & 0xc0) != 0x80)
5050 goto surrogateescape;
5051 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
5052 assert ((ch > 0x007F) && (ch <= 0x07FF));
5053 *p++ = (wchar_t)ch;
5054 break;
5055
5056 case 3:
5057 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
5058 will result in surrogates in range d800-dfff. Surrogates are
5059 not valid UTF-8 so they are rejected.
5060 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
5061 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
5062 if ((s[1] & 0xc0) != 0x80 ||
5063 (s[2] & 0xc0) != 0x80 ||
5064 ((unsigned char)s[0] == 0xE0 &&
5065 (unsigned char)s[1] < 0xA0) ||
5066 ((unsigned char)s[0] == 0xED &&
5067 (unsigned char)s[1] > 0x9F)) {
5068
5069 goto surrogateescape;
5070 }
5071 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5072 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005073 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074 break;
5075
5076 case 4:
5077 if ((s[1] & 0xc0) != 0x80 ||
5078 (s[2] & 0xc0) != 0x80 ||
5079 (s[3] & 0xc0) != 0x80 ||
5080 ((unsigned char)s[0] == 0xF0 &&
5081 (unsigned char)s[1] < 0x90) ||
5082 ((unsigned char)s[0] == 0xF4 &&
5083 (unsigned char)s[1] > 0x8F)) {
5084 goto surrogateescape;
5085 }
5086 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
5087 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01005088 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005089
5090#if SIZEOF_WCHAR_T == 4
5091 *p++ = (wchar_t)ch;
5092#else
5093 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01005094 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5095 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005096#endif
5097 break;
5098 }
5099 s += n;
5100 continue;
5101
5102 surrogateescape:
5103 *p++ = 0xDC00 + ch;
5104 s++;
5105 }
5106 *p = L'\0';
5107 return unicode;
5108}
5109
5110#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112/* Primary internal function which creates utf8 encoded bytes objects.
5113
5114 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005115 and allocate exactly as much space needed at the end. Else allocate the
5116 maximum possible needed (4 result bytes per Unicode character), and return
5117 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005118*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005119PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005120_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121{
Victor Stinner6099a032011-12-18 14:22:26 +01005122 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 void *data;
5124 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 if (!PyUnicode_Check(unicode)) {
5127 PyErr_BadArgument();
5128 return NULL;
5129 }
5130
5131 if (PyUnicode_READY(unicode) == -1)
5132 return NULL;
5133
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005134 if (PyUnicode_UTF8(unicode))
5135 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5136 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005137
5138 kind = PyUnicode_KIND(unicode);
5139 data = PyUnicode_DATA(unicode);
5140 size = PyUnicode_GET_LENGTH(unicode);
5141
Benjamin Petersonead6b532011-12-20 17:23:42 -06005142 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005143 default:
5144 assert(0);
5145 case PyUnicode_1BYTE_KIND:
5146 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5147 assert(!PyUnicode_IS_ASCII(unicode));
5148 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5149 case PyUnicode_2BYTE_KIND:
5150 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5151 case PyUnicode_4BYTE_KIND:
5152 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154}
5155
Alexander Belopolsky40018472011-02-26 01:02:56 +00005156PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5158 Py_ssize_t size,
5159 const char *errors)
5160{
5161 PyObject *v, *unicode;
5162
5163 unicode = PyUnicode_FromUnicode(s, size);
5164 if (unicode == NULL)
5165 return NULL;
5166 v = _PyUnicode_AsUTF8String(unicode, errors);
5167 Py_DECREF(unicode);
5168 return v;
5169}
5170
5171PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005172PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175}
5176
Walter Dörwald41980ca2007-08-16 21:55:45 +00005177/* --- UTF-32 Codec ------------------------------------------------------- */
5178
5179PyObject *
5180PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 Py_ssize_t size,
5182 const char *errors,
5183 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005184{
5185 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5186}
5187
5188PyObject *
5189PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 Py_ssize_t size,
5191 const char *errors,
5192 int *byteorder,
5193 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194{
5195 const char *starts = s;
5196 Py_ssize_t startinpos;
5197 Py_ssize_t endinpos;
5198 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005199 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005200 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005201 int bo = 0; /* assume native ordering by default */
5202 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005203 /* Offsets from q for retrieving bytes in the right order. */
5204#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5205 int iorder[] = {0, 1, 2, 3};
5206#else
5207 int iorder[] = {3, 2, 1, 0};
5208#endif
5209 PyObject *errorHandler = NULL;
5210 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005211
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212 q = (unsigned char *)s;
5213 e = q + size;
5214
5215 if (byteorder)
5216 bo = *byteorder;
5217
5218 /* Check for BOM marks (U+FEFF) in the input and adjust current
5219 byte order setting accordingly. In native mode, the leading BOM
5220 mark is skipped, in all other modes, it is copied to the output
5221 stream as-is (giving a ZWNBSP character). */
5222 if (bo == 0) {
5223 if (size >= 4) {
5224 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 if (bom == 0x0000FEFF) {
5228 q += 4;
5229 bo = -1;
5230 }
5231 else if (bom == 0xFFFE0000) {
5232 q += 4;
5233 bo = 1;
5234 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 if (bom == 0x0000FEFF) {
5237 q += 4;
5238 bo = 1;
5239 }
5240 else if (bom == 0xFFFE0000) {
5241 q += 4;
5242 bo = -1;
5243 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 }
5247
5248 if (bo == -1) {
5249 /* force LE */
5250 iorder[0] = 0;
5251 iorder[1] = 1;
5252 iorder[2] = 2;
5253 iorder[3] = 3;
5254 }
5255 else if (bo == 1) {
5256 /* force BE */
5257 iorder[0] = 3;
5258 iorder[1] = 2;
5259 iorder[2] = 1;
5260 iorder[3] = 0;
5261 }
5262
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005263 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005264 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005265 if (!unicode)
5266 return NULL;
5267 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005268 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005269 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005270
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 Py_UCS4 ch;
5273 /* remaining bytes at the end? (size should be divisible by 4) */
5274 if (e-q<4) {
5275 if (consumed)
5276 break;
5277 errmsg = "truncated data";
5278 startinpos = ((const char *)q)-starts;
5279 endinpos = ((const char *)e)-starts;
5280 goto utf32Error;
5281 /* The remaining input chars are ignored if the callback
5282 chooses to skip the input */
5283 }
5284 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5285 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 if (ch >= 0x110000)
5288 {
5289 errmsg = "codepoint not in range(0x110000)";
5290 startinpos = ((const char *)q)-starts;
5291 endinpos = startinpos+4;
5292 goto utf32Error;
5293 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005294 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5295 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 q += 4;
5297 continue;
5298 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 if (unicode_decode_call_errorhandler(
5300 errors, &errorHandler,
5301 "utf32", errmsg,
5302 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005303 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305 }
5306
5307 if (byteorder)
5308 *byteorder = bo;
5309
5310 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312
5313 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005314 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005315 goto onError;
5316
5317 Py_XDECREF(errorHandler);
5318 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005319 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005320
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005322 Py_DECREF(unicode);
5323 Py_XDECREF(errorHandler);
5324 Py_XDECREF(exc);
5325 return NULL;
5326}
5327
5328PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005329_PyUnicode_EncodeUTF32(PyObject *str,
5330 const char *errors,
5331 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005332{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005333 int kind;
5334 void *data;
5335 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005336 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005337 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005338 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005339 /* Offsets from p for storing byte pairs in the right order. */
5340#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5341 int iorder[] = {0, 1, 2, 3};
5342#else
5343 int iorder[] = {3, 2, 1, 0};
5344#endif
5345
Benjamin Peterson29060642009-01-31 22:14:21 +00005346#define STORECHAR(CH) \
5347 do { \
5348 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5349 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5350 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5351 p[iorder[0]] = (CH) & 0xff; \
5352 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353 } while(0)
5354
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005355 if (!PyUnicode_Check(str)) {
5356 PyErr_BadArgument();
5357 return NULL;
5358 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005359 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005360 return NULL;
5361 kind = PyUnicode_KIND(str);
5362 data = PyUnicode_DATA(str);
5363 len = PyUnicode_GET_LENGTH(str);
5364
5365 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005366 bytesize = nsize * 4;
5367 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005369 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 if (v == NULL)
5371 return NULL;
5372
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005373 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005376 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005377 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378
5379 if (byteorder == -1) {
5380 /* force LE */
5381 iorder[0] = 0;
5382 iorder[1] = 1;
5383 iorder[2] = 2;
5384 iorder[3] = 3;
5385 }
5386 else if (byteorder == 1) {
5387 /* force BE */
5388 iorder[0] = 3;
5389 iorder[1] = 2;
5390 iorder[2] = 1;
5391 iorder[3] = 0;
5392 }
5393
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005394 for (i = 0; i < len; i++)
5395 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005396
5397 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005398 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399#undef STORECHAR
5400}
5401
Alexander Belopolsky40018472011-02-26 01:02:56 +00005402PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5404 Py_ssize_t size,
5405 const char *errors,
5406 int byteorder)
5407{
5408 PyObject *result;
5409 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5410 if (tmp == NULL)
5411 return NULL;
5412 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5413 Py_DECREF(tmp);
5414 return result;
5415}
5416
5417PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005418PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419{
Victor Stinnerb960b342011-11-20 19:12:52 +01005420 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421}
5422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423/* --- UTF-16 Codec ------------------------------------------------------- */
5424
Tim Peters772747b2001-08-09 22:21:55 +00005425PyObject *
5426PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 Py_ssize_t size,
5428 const char *errors,
5429 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Walter Dörwald69652032004-09-07 20:24:22 +00005431 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5432}
5433
/* Masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   STRIPPED_MASK selects the low byte of every 16-bit unit; it is used to
   byte-swap all the units held in a 'long' at once.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005441#if (SIZEOF_LONG == 8)
5442# define FAST_CHAR_MASK 0x8000800080008000L
5443# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005444# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005445#elif (SIZEOF_LONG == 4)
5446# define FAST_CHAR_MASK 0x80008000L
5447# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005448# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005449#else
5450# error C 'long' size should be either 4 or 8!
5451#endif
5452
Walter Dörwald69652032004-09-07 20:24:22 +00005453PyObject *
5454PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 Py_ssize_t size,
5456 const char *errors,
5457 int *byteorder,
5458 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005459{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461 Py_ssize_t startinpos;
5462 Py_ssize_t endinpos;
5463 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005464 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005465 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005466 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005467 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005468 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005469 /* Offsets from q for retrieving byte pairs in the right order. */
5470#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5471 int ihi = 1, ilo = 0;
5472#else
5473 int ihi = 0, ilo = 1;
5474#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 PyObject *errorHandler = NULL;
5476 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
5478 /* Note: size will always be longer than the resulting Unicode
5479 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005480 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 if (!unicode)
5482 return NULL;
5483 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005484 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005485 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Tim Peters772747b2001-08-09 22:21:55 +00005487 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005488 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
5490 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005491 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005493 /* Check for BOM marks (U+FEFF) in the input and adjust current
5494 byte order setting accordingly. In native mode, the leading BOM
5495 mark is skipped, in all other modes, it is copied to the output
5496 stream as-is (giving a ZWNBSP character). */
5497 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005498 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005499 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005500#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 if (bom == 0xFEFF) {
5502 q += 2;
5503 bo = -1;
5504 }
5505 else if (bom == 0xFFFE) {
5506 q += 2;
5507 bo = 1;
5508 }
Tim Petersced69f82003-09-16 20:30:58 +00005509#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 if (bom == 0xFEFF) {
5511 q += 2;
5512 bo = 1;
5513 }
5514 else if (bom == 0xFFFE) {
5515 q += 2;
5516 bo = -1;
5517 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005518#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521
Tim Peters772747b2001-08-09 22:21:55 +00005522 if (bo == -1) {
5523 /* force LE */
5524 ihi = 1;
5525 ilo = 0;
5526 }
5527 else if (bo == 1) {
5528 /* force BE */
5529 ihi = 0;
5530 ilo = 1;
5531 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005532#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5533 native_ordering = ilo < ihi;
5534#else
5535 native_ordering = ilo > ihi;
5536#endif
Tim Peters772747b2001-08-09 22:21:55 +00005537
Antoine Pitrouab868312009-01-10 15:40:25 +00005538 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005539 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005540 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005541 /* First check for possible aligned read of a C 'long'. Unaligned
5542 reads are more expensive, better to defer to another iteration. */
5543 if (!((size_t) q & LONG_PTR_MASK)) {
5544 /* Fast path for runs of non-surrogate chars. */
5545 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 int kind = PyUnicode_KIND(unicode);
5547 void *data = PyUnicode_DATA(unicode);
5548 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005549 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005550 Py_UCS4 maxch;
5551 if (native_ordering) {
5552 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005553 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005554 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005555 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 else {
5557 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005558 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005559 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005560 block = ((block >> 8) & STRIPPED_MASK) |
5561 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005562 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005563 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005565 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5566 maxch = Py_MAX(maxch, ch);
5567 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5568 maxch = Py_MAX(maxch, ch);
5569 ch = (Py_UCS2)(block >> 48);
5570 maxch = Py_MAX(maxch, ch);
5571#else
5572 ch = (Py_UCS2)(block >> 16);
5573 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005574#endif
5575 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5576 if (unicode_widen(&unicode, maxch) < 0)
5577 goto onError;
5578 kind = PyUnicode_KIND(unicode);
5579 data = PyUnicode_DATA(unicode);
5580 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005581#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5582 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005583#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005584 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5585 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5586 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5587#else
5588 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5589#endif
5590#else
5591#if SIZEOF_LONG == 8
5592 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5593 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5594 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5595#else
5596 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5597#endif
5598 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599#endif
5600 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005601 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005602 q = _q;
5603 if (q >= e)
5604 break;
5605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607
Benjamin Peterson14339b62009-01-31 16:36:08 +00005608 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005609
Victor Stinner551ac952011-11-29 22:58:13 +01005610 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005611 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5612 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 continue;
5614 }
5615
5616 /* UTF-16 code pair: */
5617 if (q > e) {
5618 errmsg = "unexpected end of data";
5619 startinpos = (((const char *)q) - 2) - starts;
5620 endinpos = ((const char *)e) + 1 - starts;
5621 goto utf16Error;
5622 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005623 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5624 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005626 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005627 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005628 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005629 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 continue;
5631 }
5632 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005633 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 startinpos = (((const char *)q)-4)-starts;
5635 endinpos = startinpos+2;
5636 goto utf16Error;
5637 }
5638
Benjamin Peterson14339b62009-01-31 16:36:08 +00005639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 errmsg = "illegal encoding";
5641 startinpos = (((const char *)q)-2)-starts;
5642 endinpos = startinpos+2;
5643 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005644
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005647 errors,
5648 &errorHandler,
5649 "utf16", errmsg,
5650 &starts,
5651 (const char **)&e,
5652 &startinpos,
5653 &endinpos,
5654 &exc,
5655 (const char **)&q,
5656 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005657 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005660 /* remaining byte at the end? (size should be even) */
5661 if (e == q) {
5662 if (!consumed) {
5663 errmsg = "truncated data";
5664 startinpos = ((const char *)q) - starts;
5665 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005666 if (unicode_decode_call_errorhandler(
5667 errors,
5668 &errorHandler,
5669 "utf16", errmsg,
5670 &starts,
5671 (const char **)&e,
5672 &startinpos,
5673 &endinpos,
5674 &exc,
5675 (const char **)&q,
5676 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005678 goto onError;
5679 /* The remaining input chars are ignored if the callback
5680 chooses to skip the input */
5681 }
5682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683
5684 if (byteorder)
5685 *byteorder = bo;
5686
Walter Dörwald69652032004-09-07 20:24:22 +00005687 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005691 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 goto onError;
5693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 Py_XDECREF(errorHandler);
5695 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005696 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005700 Py_XDECREF(errorHandler);
5701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 return NULL;
5703}
5704
Antoine Pitrouab868312009-01-10 15:40:25 +00005705#undef FAST_CHAR_MASK
5706#undef SWAPPED_FAST_CHAR_MASK
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 int kind;
5714 void *data;
5715 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005716 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005717 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005718 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005719 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005720 /* Offsets from p for storing byte pairs in the right order. */
5721#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5722 int ihi = 1, ilo = 0;
5723#else
5724 int ihi = 0, ilo = 1;
5725#endif
5726
Benjamin Peterson29060642009-01-31 22:14:21 +00005727#define STORECHAR(CH) \
5728 do { \
5729 p[ihi] = ((CH) >> 8) & 0xff; \
5730 p[ilo] = (CH) & 0xff; \
5731 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005732 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734 if (!PyUnicode_Check(str)) {
5735 PyErr_BadArgument();
5736 return NULL;
5737 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005738 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005739 return NULL;
5740 kind = PyUnicode_KIND(str);
5741 data = PyUnicode_DATA(str);
5742 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005743
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 pairs = 0;
5745 if (kind == PyUnicode_4BYTE_KIND)
5746 for (i = 0; i < len; i++)
5747 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5748 pairs++;
5749 /* 2 * (len + pairs + (byteorder == 0)) */
5750 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005753 bytesize = nsize * 2;
5754 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005756 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 if (v == NULL)
5758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005760 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005764 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005765
5766 if (byteorder == -1) {
5767 /* force LE */
5768 ihi = 1;
5769 ilo = 0;
5770 }
5771 else if (byteorder == 1) {
5772 /* force BE */
5773 ihi = 0;
5774 ilo = 1;
5775 }
5776
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005777 for (i = 0; i < len; i++) {
5778 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5779 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005781 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5782 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 }
Tim Peters772747b2001-08-09 22:21:55 +00005784 STORECHAR(ch);
5785 if (ch2)
5786 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005787 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005788
5789 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005790 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005791#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
Alexander Belopolsky40018472011-02-26 01:02:56 +00005794PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5796 Py_ssize_t size,
5797 const char *errors,
5798 int byteorder)
5799{
5800 PyObject *result;
5801 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5802 if (tmp == NULL)
5803 return NULL;
5804 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5805 Py_DECREF(tmp);
5806 return result;
5807}
5808
5809PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005810PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005812 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813}
5814
5815/* --- Unicode Escape Codec ----------------------------------------------- */
5816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5818 if all the escapes in the string make it still a valid ASCII string.
5819 Returns -1 if any escapes were found which cause the string to
5820 pop out of ASCII range. Otherwise returns the length of the
5821 required buffer to hold the string.
5822 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005823static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5825{
5826 const unsigned char *p = (const unsigned char *)s;
5827 const unsigned char *end = p + size;
5828 Py_ssize_t length = 0;
5829
5830 if (size < 0)
5831 return -1;
5832
5833 for (; p < end; ++p) {
5834 if (*p > 127) {
5835 /* Non-ASCII */
5836 return -1;
5837 }
5838 else if (*p != '\\') {
5839 /* Normal character */
5840 ++length;
5841 }
5842 else {
5843 /* Backslash-escape, check next char */
5844 ++p;
5845 /* Escape sequence reaches till end of string or
5846 non-ASCII follow-up. */
5847 if (p >= end || *p > 127)
5848 return -1;
5849 switch (*p) {
5850 case '\n':
5851 /* backslash + \n result in zero characters */
5852 break;
5853 case '\\': case '\'': case '\"':
5854 case 'b': case 'f': case 't':
5855 case 'n': case 'r': case 'v': case 'a':
5856 ++length;
5857 break;
5858 case '0': case '1': case '2': case '3':
5859 case '4': case '5': case '6': case '7':
5860 case 'x': case 'u': case 'U': case 'N':
5861 /* these do not guarantee ASCII characters */
5862 return -1;
5863 default:
5864 /* count the backslash + the other character */
5865 length += 2;
5866 }
5867 }
5868 }
5869 return length;
5870}
5871
Fredrik Lundh06d12682001-01-24 07:59:11 +00005872static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
5875PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005876 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005880 Py_ssize_t startinpos;
5881 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005882 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005883 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005885 char* message;
5886 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 PyObject *errorHandler = NULL;
5888 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005890 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005891
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893
5894 /* After length_of_escaped_ascii_string() there are two alternatives,
5895 either the string is pure ASCII with named escapes like \n, etc.
5896 and we determined it's exact size (common case)
5897 or it contains \x, \u, ... escape sequences. then we create a
5898 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005899 if (len >= 0) {
5900 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 if (!v)
5902 goto onError;
5903 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005904 }
5905 else {
5906 /* Escaped strings will always be longer than the resulting
5907 Unicode string, so we start with size here and then reduce the
5908 length after conversion to the true value.
5909 (but if the error callback returns a long replacement string
5910 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005911 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005912 if (!v)
5913 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005914 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915 }
5916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005918 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005919 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 while (s < end) {
5923 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005924 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 /* The only case in which i == ascii_length is a backslash
5928 followed by a newline. */
5929 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 /* Non-escape characters are interpreted as Unicode ordinals */
5932 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005933 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 continue;
5936 }
5937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 /* \ - Escapes */
5940 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005941 c = *s++;
5942 if (s > end)
5943 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005944
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 /* The only case in which i == ascii_length is a backslash
5946 followed by a newline. */
5947 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005949 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952#define WRITECHAR(ch) \
5953 do { \
5954 if (unicode_putchar(&v, &i, ch) < 0) \
5955 goto onError; \
5956 }while(0)
5957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005959 case '\\': WRITECHAR('\\'); break;
5960 case '\'': WRITECHAR('\''); break;
5961 case '\"': WRITECHAR('\"'); break;
5962 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005963 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 case 'f': WRITECHAR('\014'); break;
5965 case 't': WRITECHAR('\t'); break;
5966 case 'n': WRITECHAR('\n'); break;
5967 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005969 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005971 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 case '0': case '1': case '2': case '3':
5975 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005976 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005977 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005978 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005979 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005980 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005982 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 break;
5984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 /* hex escapes */
5986 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005988 digits = 2;
5989 message = "truncated \\xXX escape";
5990 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005994 digits = 4;
5995 message = "truncated \\uXXXX escape";
5996 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005999 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006000 digits = 8;
6001 message = "truncated \\UXXXXXXXX escape";
6002 hexescape:
6003 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 if (s+digits>end) {
6005 endinpos = size;
6006 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 errors, &errorHandler,
6008 "unicodeescape", "end of string in escape sequence",
6009 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006010 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 goto onError;
6012 goto nextByte;
6013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006014 for (j = 0; j < digits; ++j) {
6015 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006016 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006017 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 errors, &errorHandler,
6020 "unicodeescape", message,
6021 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006022 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006023 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006024 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006026 }
6027 chr = (chr<<4) & ~0xF;
6028 if (c >= '0' && c <= '9')
6029 chr += c - '0';
6030 else if (c >= 'a' && c <= 'f')
6031 chr += 10 + c - 'a';
6032 else
6033 chr += 10 + c - 'A';
6034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006035 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006036 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 /* _decoding_error will have already written into the
6038 target buffer. */
6039 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006042 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006043 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006044 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 errors, &errorHandler,
6048 "unicodeescape", "illegal Unicode character",
6049 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006051 goto onError;
6052 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006053 break;
6054
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056 case 'N':
6057 message = "malformed \\N character escape";
6058 if (ucnhash_CAPI == NULL) {
6059 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6061 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 if (ucnhash_CAPI == NULL)
6063 goto ucnhashError;
6064 }
6065 if (*s == '{') {
6066 const char *start = s+1;
6067 /* look for the closing brace */
6068 while (*s != '}' && s < end)
6069 s++;
6070 if (s > start && s < end && *s == '}') {
6071 /* found a name. look it up in the unicode database */
6072 message = "unknown Unicode character name";
6073 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006074 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006075 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 goto store;
6077 }
6078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 errors, &errorHandler,
6082 "unicodeescape", message,
6083 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006084 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006086 break;
6087
6088 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006089 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 message = "\\ at end of string";
6091 s--;
6092 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 errors, &errorHandler,
6095 "unicodeescape", message,
6096 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006097 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006098 goto onError;
6099 }
6100 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006101 WRITECHAR('\\');
6102 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006103 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006104 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006109#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006110
Victor Stinner16e6a802011-12-12 13:24:15 +01006111 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006112 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006113 Py_XDECREF(errorHandler);
6114 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006115 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006116
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006118 PyErr_SetString(
6119 PyExc_UnicodeError,
6120 "\\N escapes not supported (can't load unicodedata module)"
6121 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006122 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 Py_XDECREF(errorHandler);
6124 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006125 return NULL;
6126
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 Py_XDECREF(errorHandler);
6130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return NULL;
6132}
6133
6134/* Return a Unicode-Escape string version of the Unicode object.
6135
6136 If quotes is true, the string is enclosed in u"" or u'' quotes as
6137 appropriate.
6138
6139*/
6140
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006145 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 int kind;
6148 void *data;
6149 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Thomas Wouters89f507f2006-12-13 04:49:30 +00006151 /* Initial allocation is based on the longest-possible unichr
6152 escape.
6153
6154 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6155 unichr, so in this case it's the longest unichr escape. In
6156 narrow (UTF-16) builds this is five chars per source unichr
6157 since there are two unichrs in the surrogate pair, so in narrow
6158 (UTF-16) builds it's not the longest unichr escape.
6159
6160 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6161 so in the narrow (UTF-16) build case it's the longest unichr
6162 escape.
6163 */
6164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 if (!PyUnicode_Check(unicode)) {
6166 PyErr_BadArgument();
6167 return NULL;
6168 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006169 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 return NULL;
6171 len = PyUnicode_GET_LENGTH(unicode);
6172 kind = PyUnicode_KIND(unicode);
6173 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006174 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6176 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6177 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6178 }
6179
6180 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006181 return PyBytes_FromStringAndSize(NULL, 0);
6182
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 if (repr == NULL)
6191 return NULL;
6192
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006193 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006196 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006197
Walter Dörwald79e913e2007-05-12 11:08:06 +00006198 /* Escape backslashes */
6199 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 *p++ = '\\';
6201 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006202 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006203 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006204
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006205 /* Map 21-bit characters to '\U00xxxxxx' */
6206 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006207 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006208 *p++ = '\\';
6209 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006210 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6211 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6212 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6213 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6214 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6215 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6216 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6217 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006219 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006222 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 *p++ = '\\';
6224 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006225 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6226 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6227 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6228 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006230
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006231 /* Map special whitespace to '\t', \n', '\r' */
6232 else if (ch == '\t') {
6233 *p++ = '\\';
6234 *p++ = 't';
6235 }
6236 else if (ch == '\n') {
6237 *p++ = '\\';
6238 *p++ = 'n';
6239 }
6240 else if (ch == '\r') {
6241 *p++ = '\\';
6242 *p++ = 'r';
6243 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006245 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006246 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006249 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6250 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006251 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006252
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 /* Copy everything else as-is */
6254 else
6255 *p++ = (char) ch;
6256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006258 assert(p - PyBytes_AS_STRING(repr) > 0);
6259 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6260 return NULL;
6261 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262}
6263
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6266 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268 PyObject *result;
6269 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6270 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006272 result = PyUnicode_AsUnicodeEscapeString(tmp);
6273 Py_DECREF(tmp);
6274 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275}
6276
6277/* --- Raw Unicode Escape Codec ------------------------------------------- */
6278
Alexander Belopolsky40018472011-02-26 01:02:56 +00006279PyObject *
6280PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006281 Py_ssize_t size,
6282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006284 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006285 Py_ssize_t startinpos;
6286 Py_ssize_t endinpos;
6287 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006288 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 const char *end;
6290 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 PyObject *errorHandler = NULL;
6292 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006293
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 /* Escaped strings will always be longer than the resulting
6295 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 length after conversion to the true value. (But decoding error
6297 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006298 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006302 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006303 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 end = s + size;
6305 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 unsigned char c;
6307 Py_UCS4 x;
6308 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006309 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 /* Non-escape characters are interpreted as Unicode ordinals */
6312 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006313 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6314 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 startinpos = s-starts;
6318
6319 /* \u-escapes are only interpreted iff the number of leading
6320 backslashes if odd */
6321 bs = s;
6322 for (;s < end;) {
6323 if (*s != '\\')
6324 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006325 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6326 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 }
6328 if (((s - bs) & 1) == 0 ||
6329 s >= end ||
6330 (*s != 'u' && *s != 'U')) {
6331 continue;
6332 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006333 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 count = *s=='u' ? 4 : 8;
6335 s++;
6336
6337 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 for (x = 0, i = 0; i < count; ++i, ++s) {
6339 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006340 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 endinpos = s-starts;
6342 if (unicode_decode_call_errorhandler(
6343 errors, &errorHandler,
6344 "rawunicodeescape", "truncated \\uXXXX",
6345 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006346 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 goto onError;
6348 goto nextByte;
6349 }
6350 x = (x<<4) & ~0xF;
6351 if (c >= '0' && c <= '9')
6352 x += c - '0';
6353 else if (c >= 'a' && c <= 'f')
6354 x += 10 + c - 'a';
6355 else
6356 x += 10 + c - 'A';
6357 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006358 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006359 if (unicode_putchar(&v, &outpos, x) < 0)
6360 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006361 } else {
6362 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006363 if (unicode_decode_call_errorhandler(
6364 errors, &errorHandler,
6365 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006367 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 nextByte:
6371 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006373 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 Py_XDECREF(errorHandler);
6376 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006377 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006378
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 Py_XDECREF(errorHandler);
6382 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 return NULL;
6384}
6385
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006390 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 char *p;
6392 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 Py_ssize_t expandsize, pos;
6394 int kind;
6395 void *data;
6396 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 if (!PyUnicode_Check(unicode)) {
6399 PyErr_BadArgument();
6400 return NULL;
6401 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006402 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 return NULL;
6404 kind = PyUnicode_KIND(unicode);
6405 data = PyUnicode_DATA(unicode);
6406 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006407 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6408 bytes, and 1 byte characters 4. */
6409 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006410
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006413
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 if (repr == NULL)
6416 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006417 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006418 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006420 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 for (pos = 0; pos < len; pos++) {
6422 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 /* Map 32-bit characters to '\Uxxxxxxxx' */
6424 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006425 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006426 *p++ = '\\';
6427 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006428 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6429 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6430 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6431 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6432 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6433 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6434 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6435 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006438 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 *p++ = '\\';
6440 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006441 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6442 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6443 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6444 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 /* Copy everything else as-is */
6447 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 *p++ = (char) ch;
6449 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006450
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 assert(p > q);
6452 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006453 return NULL;
6454 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455}
6456
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6459 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461 PyObject *result;
6462 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6463 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006464 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006465 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6466 Py_DECREF(tmp);
6467 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468}
6469
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006470/* --- Unicode Internal Codec ------------------------------------------- */
6471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
6473_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006474 Py_ssize_t size,
6475 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006476{
6477 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006478 Py_ssize_t startinpos;
6479 Py_ssize_t endinpos;
6480 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006481 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006482 const char *end;
6483 const char *reason;
6484 PyObject *errorHandler = NULL;
6485 PyObject *exc = NULL;
6486
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006487 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006488 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006489 1))
6490 return NULL;
6491
Thomas Wouters89f507f2006-12-13 04:49:30 +00006492 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006493 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006494 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006496 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006497 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006498 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 end = s + size;
6500
6501 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006502 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006503 Py_UCS4 ch;
6504 /* We copy the raw representation one byte at a time because the
6505 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006506 ((char *) &uch)[0] = s[0];
6507 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006508#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006509 ((char *) &uch)[2] = s[2];
6510 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006511#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006512 ch = uch;
6513
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006514 /* We have to sanity check the raw data, otherwise doom looms for
6515 some malformed UCS-4 data. */
6516 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006517#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006518 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006519#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006520 end-s < Py_UNICODE_SIZE
6521 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006523 startinpos = s - starts;
6524 if (end-s < Py_UNICODE_SIZE) {
6525 endinpos = end-starts;
6526 reason = "truncated input";
6527 }
6528 else {
6529 endinpos = s - starts + Py_UNICODE_SIZE;
6530 reason = "illegal code point (> 0x10FFFF)";
6531 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532 if (unicode_decode_call_errorhandler(
6533 errors, &errorHandler,
6534 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006535 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006536 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006538 continue;
6539 }
6540
6541 s += Py_UNICODE_SIZE;
6542#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006543 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006545 Py_UNICODE uch2;
6546 ((char *) &uch2)[0] = s[0];
6547 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006548 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006549 {
Victor Stinner551ac952011-11-29 22:58:13 +01006550 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006551 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 }
6553 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554#endif
6555
6556 if (unicode_putchar(&v, &outpos, ch) < 0)
6557 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006558 }
6559
Victor Stinner16e6a802011-12-12 13:24:15 +01006560 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006561 goto onError;
6562 Py_XDECREF(errorHandler);
6563 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006564 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006565
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006567 Py_XDECREF(v);
6568 Py_XDECREF(errorHandler);
6569 Py_XDECREF(exc);
6570 return NULL;
6571}
6572
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573/* --- Latin-1 Codec ------------------------------------------------------ */
6574
Alexander Belopolsky40018472011-02-26 01:02:56 +00006575PyObject *
6576PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006577 Py_ssize_t size,
6578 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006581 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582}
6583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585static void
6586make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006587 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006588 PyObject *unicode,
6589 Py_ssize_t startpos, Py_ssize_t endpos,
6590 const char *reason)
6591{
6592 if (*exceptionObject == NULL) {
6593 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006595 encoding, unicode, startpos, endpos, reason);
6596 }
6597 else {
6598 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6599 goto onError;
6600 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6601 goto onError;
6602 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6603 goto onError;
6604 return;
6605 onError:
6606 Py_DECREF(*exceptionObject);
6607 *exceptionObject = NULL;
6608 }
6609}
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612static void
6613raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006615 PyObject *unicode,
6616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 const char *reason)
6618{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006619 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 encoding, unicode, startpos, endpos, reason);
6621 if (*exceptionObject != NULL)
6622 PyCodec_StrictErrors(*exceptionObject);
6623}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624
6625/* error handling callback helper:
6626 build arguments, call the callback and check the arguments,
6627 put the result into newpos and return the replacement string, which
6628 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629static PyObject *
6630unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006631 PyObject **errorHandler,
6632 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006634 Py_ssize_t startpos, Py_ssize_t endpos,
6635 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006637 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 PyObject *restuple;
6640 PyObject *resunicode;
6641
6642 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 }
6647
Benjamin Petersonbac79492012-01-14 13:34:47 -05006648 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 return NULL;
6650 len = PyUnicode_GET_LENGTH(unicode);
6651
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006652 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656
6657 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006662 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 Py_DECREF(restuple);
6664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006666 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 &resunicode, newpos)) {
6668 Py_DECREF(restuple);
6669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6672 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6673 Py_DECREF(restuple);
6674 return NULL;
6675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 *newpos = len + *newpos;
6678 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6680 Py_DECREF(restuple);
6681 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 Py_INCREF(resunicode);
6684 Py_DECREF(restuple);
6685 return resunicode;
6686}
6687
Alexander Belopolsky40018472011-02-26 01:02:56 +00006688static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006690 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006691 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006693 /* input state */
6694 Py_ssize_t pos=0, size;
6695 int kind;
6696 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 /* output object */
6698 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 /* pointer into the output */
6700 char *str;
6701 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006702 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006703 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6704 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 PyObject *errorHandler = NULL;
6706 PyObject *exc = NULL;
6707 /* the following variable is used for caching string comparisons
6708 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6709 int known_errorHandler = -1;
6710
Benjamin Petersonbac79492012-01-14 13:34:47 -05006711 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 return NULL;
6713 size = PyUnicode_GET_LENGTH(unicode);
6714 kind = PyUnicode_KIND(unicode);
6715 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 /* allocate enough for a simple encoding without
6717 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006718 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006719 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006720 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006722 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006723 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 ressize = size;
6725
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 while (pos < size) {
6727 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 /* can we encode this? */
6730 if (c<limit) {
6731 /* no overflow check, because we know that the space is enough */
6732 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006734 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 Py_ssize_t requiredsize;
6737 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 Py_ssize_t collstart = pos;
6741 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006743 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 ++collend;
6745 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6746 if (known_errorHandler==-1) {
6747 if ((errors==NULL) || (!strcmp(errors, "strict")))
6748 known_errorHandler = 1;
6749 else if (!strcmp(errors, "replace"))
6750 known_errorHandler = 2;
6751 else if (!strcmp(errors, "ignore"))
6752 known_errorHandler = 3;
6753 else if (!strcmp(errors, "xmlcharrefreplace"))
6754 known_errorHandler = 4;
6755 else
6756 known_errorHandler = 0;
6757 }
6758 switch (known_errorHandler) {
6759 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006760 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
6762 case 2: /* replace */
6763 while (collstart++<collend)
6764 *str++ = '?'; /* fall through */
6765 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 break;
6768 case 4: /* xmlcharrefreplace */
6769 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 /* determine replacement size */
6771 for (i = collstart, repsize = 0; i < collend; ++i) {
6772 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6773 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006775 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006777 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006785 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006786 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006788 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 if (requiredsize > ressize) {
6792 if (requiredsize<2*ressize)
6793 requiredsize = 2*ressize;
6794 if (_PyBytes_Resize(&res, requiredsize))
6795 goto onError;
6796 str = PyBytes_AS_STRING(res) + respos;
6797 ressize = requiredsize;
6798 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006799 /* generate replacement */
6800 for (i = collstart; i < collend; ++i) {
6801 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006803 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 break;
6805 default:
6806 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006807 encoding, reason, unicode, &exc,
6808 collstart, collend, &newpos);
6809 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006810 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006812 if (PyBytes_Check(repunicode)) {
6813 /* Directly copy bytes result to output. */
6814 repsize = PyBytes_Size(repunicode);
6815 if (repsize > 1) {
6816 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006817 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006818 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6819 Py_DECREF(repunicode);
6820 goto onError;
6821 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006822 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006823 ressize += repsize-1;
6824 }
6825 memcpy(str, PyBytes_AsString(repunicode), repsize);
6826 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006827 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006828 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006829 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 /* need more space? (at least enough for what we
6832 have+the replacement+the rest of the string, so
6833 we won't have to check space for encodable characters) */
6834 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835 repsize = PyUnicode_GET_LENGTH(repunicode);
6836 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 if (requiredsize > ressize) {
6838 if (requiredsize<2*ressize)
6839 requiredsize = 2*ressize;
6840 if (_PyBytes_Resize(&res, requiredsize)) {
6841 Py_DECREF(repunicode);
6842 goto onError;
6843 }
6844 str = PyBytes_AS_STRING(res) + respos;
6845 ressize = requiredsize;
6846 }
6847 /* check if there is anything unencodable in the replacement
6848 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 for (i = 0; repsize-->0; ++i, ++str) {
6850 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006852 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006853 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 Py_DECREF(repunicode);
6855 goto onError;
6856 }
6857 *str = (char)c;
6858 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006860 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862 }
6863 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006864 /* Resize if we allocated to much */
6865 size = str - PyBytes_AS_STRING(res);
6866 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006867 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006868 if (_PyBytes_Resize(&res, size) < 0)
6869 goto onError;
6870 }
6871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 Py_XDECREF(errorHandler);
6873 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874 return res;
6875
6876 onError:
6877 Py_XDECREF(res);
6878 Py_XDECREF(errorHandler);
6879 Py_XDECREF(exc);
6880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881}
6882
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006886 Py_ssize_t size,
6887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 PyObject *result;
6890 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 PyErr_BadArgument();
6903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006931 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006932 int kind;
6933 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006938 int has_error;
6939 const unsigned char *p = (const unsigned char *)s;
6940 const unsigned char *end = p + size;
6941 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 PyObject *errorHandler = NULL;
6943 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006945 if (size == 0) {
6946 Py_INCREF(unicode_empty);
6947 return unicode_empty;
6948 }
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006951 if (size == 1 && (unsigned char)s[0] < 128)
6952 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006953
Victor Stinner702c7342011-10-05 13:50:52 +02006954 has_error = 0;
6955 while (p < end && !has_error) {
6956 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6957 an explanation. */
6958 if (!((size_t) p & LONG_PTR_MASK)) {
6959 /* Help register allocation */
6960 register const unsigned char *_p = p;
6961 while (_p < aligned_end) {
6962 unsigned long value = *(unsigned long *) _p;
6963 if (value & ASCII_CHAR_MASK) {
6964 has_error = 1;
6965 break;
6966 }
6967 _p += SIZEOF_LONG;
6968 }
6969 if (_p == end)
6970 break;
6971 if (has_error)
6972 break;
6973 p = _p;
6974 }
6975 if (*p & 0x80) {
6976 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006977 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006978 }
6979 else {
6980 ++p;
6981 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006982 }
Victor Stinner702c7342011-10-05 13:50:52 +02006983 if (!has_error)
6984 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006985
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006986 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006990 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006991 kind = PyUnicode_KIND(v);
6992 data = PyUnicode_DATA(v);
6993 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 e = s + size;
6995 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 register unsigned char c = (unsigned char)*s;
6997 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006998 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 ++s;
7000 }
7001 else {
7002 startinpos = s-starts;
7003 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 if (unicode_decode_call_errorhandler(
7005 errors, &errorHandler,
7006 "ascii", "ordinal not in range(128)",
7007 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007008 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007010 kind = PyUnicode_KIND(v);
7011 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007015 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 Py_XDECREF(errorHandler);
7017 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007018 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007019 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007023 Py_XDECREF(errorHandler);
7024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 return NULL;
7026}
7027
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029PyObject *
7030PyUnicode_EncodeASCII(const Py_UNICODE *p,
7031 Py_ssize_t size,
7032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 PyObject *result;
7035 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7036 if (unicode == NULL)
7037 return NULL;
7038 result = unicode_encode_ucs1(unicode, errors, 128);
7039 Py_DECREF(unicode);
7040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
Alexander Belopolsky40018472011-02-26 01:02:56 +00007043PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045{
7046 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyErr_BadArgument();
7048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 if (PyUnicode_READY(unicode) == -1)
7051 return NULL;
7052 /* Fast path: if it is an ASCII-only string, construct bytes object
7053 directly. Else defer to above function to raise the exception. */
7054 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7055 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7056 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007058}
7059
7060PyObject *
7061PyUnicode_AsASCIIString(PyObject *unicode)
7062{
7063 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Victor Stinner99b95382011-07-04 14:23:54 +02007066#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007067
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007068/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007069
/* The Win32 conversion functions used below take their lengths as int.
   When int is narrower than size_t a buffer can be longer than INT_MAX,
   so the code-page codecs have to process the input in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for SDK headers that do not provide the flag. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
7077
7078static char*
7079code_page_name(UINT code_page, PyObject **obj)
7080{
7081 *obj = NULL;
7082 if (code_page == CP_ACP)
7083 return "mbcs";
7084 if (code_page == CP_UTF7)
7085 return "CP_UTF7";
7086 if (code_page == CP_UTF8)
7087 return "CP_UTF8";
7088
7089 *obj = PyBytes_FromFormat("cp%u", code_page);
7090 if (*obj == NULL)
7091 return NULL;
7092 return PyBytes_AS_STRING(*obj);
7093}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007096is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097{
7098 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 if (!IsDBCSLeadByteEx(code_page, *curr))
7102 return 0;
7103
7104 prev = CharPrevExA(code_page, s, curr, 0);
7105 if (prev == curr)
7106 return 1;
7107 /* FIXME: This code is limited to "true" double-byte encodings,
7108 as it assumes an incomplete character consists of a single
7109 byte. */
7110 if (curr - prev == 2)
7111 return 1;
7112 if (!IsDBCSLeadByteEx(code_page, *prev))
7113 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114 return 0;
7115}
7116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117static DWORD
7118decode_code_page_flags(UINT code_page)
7119{
7120 if (code_page == CP_UTF7) {
7121 /* The CP_UTF7 decoder only supports flags=0 */
7122 return 0;
7123 }
7124 else
7125 return MB_ERR_INVALID_CHARS;
7126}
7127
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 * Decode a byte string from a Windows code page into unicode object in strict
7130 * mode.
7131 *
7132 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7133 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007135static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007136decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007137 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 const char *in,
7139 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140{
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007142 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144
7145 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 assert(insize > 0);
7147 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7148 if (outsize <= 0)
7149 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150
7151 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007153 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007154 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 if (*v == NULL)
7156 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007158 }
7159 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007162 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007165 }
7166
7167 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7169 if (outsize <= 0)
7170 goto error;
7171 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007172
Victor Stinner3a50e702011-10-18 21:21:00 +02007173error:
7174 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7175 return -2;
7176 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007177 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007178}
7179
Victor Stinner3a50e702011-10-18 21:21:00 +02007180/*
7181 * Decode a byte string from a code page into unicode object with an error
7182 * handler.
7183 *
7184 * Returns consumed size if succeed, or raise a WindowsError or
7185 * UnicodeDecodeError exception and returns -1 on error.
7186 */
7187static int
7188decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007189 PyObject **v,
7190 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 const char *errors)
7192{
7193 const char *startin = in;
7194 const char *endin = in + size;
7195 const DWORD flags = decode_code_page_flags(code_page);
7196 /* Ideally, we should get reason from FormatMessage. This is the Windows
7197 2000 English version of the message. */
7198 const char *reason = "No mapping for the Unicode character exists "
7199 "in the target code page.";
7200 /* each step cannot decode more than 1 character, but a character can be
7201 represented as a surrogate pair */
7202 wchar_t buffer[2], *startout, *out;
7203 int insize, outsize;
7204 PyObject *errorHandler = NULL;
7205 PyObject *exc = NULL;
7206 PyObject *encoding_obj = NULL;
7207 char *encoding;
7208 DWORD err;
7209 int ret = -1;
7210
7211 assert(size > 0);
7212
7213 encoding = code_page_name(code_page, &encoding_obj);
7214 if (encoding == NULL)
7215 return -1;
7216
7217 if (errors == NULL || strcmp(errors, "strict") == 0) {
7218 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7219 UnicodeDecodeError. */
7220 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7221 if (exc != NULL) {
7222 PyCodec_StrictErrors(exc);
7223 Py_CLEAR(exc);
7224 }
7225 goto error;
7226 }
7227
7228 if (*v == NULL) {
7229 /* Create unicode object */
7230 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7231 PyErr_NoMemory();
7232 goto error;
7233 }
Victor Stinnerab595942011-12-17 04:59:06 +01007234 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007235 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 if (*v == NULL)
7237 goto error;
7238 startout = PyUnicode_AS_UNICODE(*v);
7239 }
7240 else {
7241 /* Extend unicode object */
7242 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7243 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7244 PyErr_NoMemory();
7245 goto error;
7246 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007247 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 goto error;
7249 startout = PyUnicode_AS_UNICODE(*v) + n;
7250 }
7251
7252 /* Decode the byte string character per character */
7253 out = startout;
7254 while (in < endin)
7255 {
7256 /* Decode a character */
7257 insize = 1;
7258 do
7259 {
7260 outsize = MultiByteToWideChar(code_page, flags,
7261 in, insize,
7262 buffer, Py_ARRAY_LENGTH(buffer));
7263 if (outsize > 0)
7264 break;
7265 err = GetLastError();
7266 if (err != ERROR_NO_UNICODE_TRANSLATION
7267 && err != ERROR_INSUFFICIENT_BUFFER)
7268 {
7269 PyErr_SetFromWindowsErr(0);
7270 goto error;
7271 }
7272 insize++;
7273 }
7274 /* 4=maximum length of a UTF-8 sequence */
7275 while (insize <= 4 && (in + insize) <= endin);
7276
7277 if (outsize <= 0) {
7278 Py_ssize_t startinpos, endinpos, outpos;
7279
7280 startinpos = in - startin;
7281 endinpos = startinpos + 1;
7282 outpos = out - PyUnicode_AS_UNICODE(*v);
7283 if (unicode_decode_call_errorhandler(
7284 errors, &errorHandler,
7285 encoding, reason,
7286 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007287 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 {
7289 goto error;
7290 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007291 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 }
7293 else {
7294 in += insize;
7295 memcpy(out, buffer, outsize * sizeof(wchar_t));
7296 out += outsize;
7297 }
7298 }
7299
7300 /* write a NUL character at the end */
7301 *out = 0;
7302
7303 /* Extend unicode object */
7304 outsize = out - startout;
7305 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007306 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007308 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007309
7310error:
7311 Py_XDECREF(encoding_obj);
7312 Py_XDECREF(errorHandler);
7313 Py_XDECREF(exc);
7314 return ret;
7315}
7316
Victor Stinner3a50e702011-10-18 21:21:00 +02007317static PyObject *
7318decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 const char *s, Py_ssize_t size,
7320 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321{
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 PyObject *v = NULL;
7323 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007324
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 if (code_page < 0) {
7326 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7327 return NULL;
7328 }
7329
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 do
7334 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007335#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 if (size > INT_MAX) {
7337 chunk_size = INT_MAX;
7338 final = 0;
7339 done = 0;
7340 }
7341 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 {
7344 chunk_size = (int)size;
7345 final = (consumed == NULL);
7346 done = 1;
7347 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348
Victor Stinner76a31a62011-11-04 00:05:13 +01007349 /* Skip trailing lead-byte unless 'final' is set */
7350 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7351 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352
Victor Stinner76a31a62011-11-04 00:05:13 +01007353 if (chunk_size == 0 && done) {
7354 if (v != NULL)
7355 break;
7356 Py_INCREF(unicode_empty);
7357 return unicode_empty;
7358 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359
Victor Stinner76a31a62011-11-04 00:05:13 +01007360
7361 converted = decode_code_page_strict(code_page, &v,
7362 s, chunk_size);
7363 if (converted == -2)
7364 converted = decode_code_page_errors(code_page, &v,
7365 s, chunk_size,
7366 errors);
7367 assert(converted != 0);
7368
7369 if (converted < 0) {
7370 Py_XDECREF(v);
7371 return NULL;
7372 }
7373
7374 if (consumed)
7375 *consumed += converted;
7376
7377 s += converted;
7378 size -= converted;
7379 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007380
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007381 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382}
7383
Alexander Belopolsky40018472011-02-26 01:02:56 +00007384PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007385PyUnicode_DecodeCodePageStateful(int code_page,
7386 const char *s,
7387 Py_ssize_t size,
7388 const char *errors,
7389 Py_ssize_t *consumed)
7390{
7391 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7392}
7393
7394PyObject *
7395PyUnicode_DecodeMBCSStateful(const char *s,
7396 Py_ssize_t size,
7397 const char *errors,
7398 Py_ssize_t *consumed)
7399{
7400 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7401}
7402
7403PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007404PyUnicode_DecodeMBCS(const char *s,
7405 Py_ssize_t size,
7406 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007407{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007408 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7409}
7410
Victor Stinner3a50e702011-10-18 21:21:00 +02007411static DWORD
7412encode_code_page_flags(UINT code_page, const char *errors)
7413{
7414 if (code_page == CP_UTF8) {
7415 if (winver.dwMajorVersion >= 6)
7416 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7417 and later */
7418 return WC_ERR_INVALID_CHARS;
7419 else
7420 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7421 return 0;
7422 }
7423 else if (code_page == CP_UTF7) {
7424 /* CP_UTF7 only supports flags=0 */
7425 return 0;
7426 }
7427 else {
7428 if (errors != NULL && strcmp(errors, "replace") == 0)
7429 return 0;
7430 else
7431 return WC_NO_BEST_FIT_CHARS;
7432 }
7433}
7434
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 * Encode a Unicode string to a Windows code page into a byte string in strict
7437 * mode.
7438 *
7439 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7440 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007442static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007443encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007444 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446{
Victor Stinner554f3f02010-06-16 23:33:54 +00007447 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 BOOL *pusedDefaultChar = &usedDefaultChar;
7449 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007450 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007451 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const DWORD flags = encode_code_page_flags(code_page, NULL);
7454 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 /* Create a substring so that we can get the UTF-16 representation
7456 of just the slice under consideration. */
7457 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458
Martin v. Löwis3d325192011-11-04 18:23:06 +01007459 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007462 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007464 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007465
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 substring = PyUnicode_Substring(unicode, offset, offset+len);
7467 if (substring == NULL)
7468 return -1;
7469 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7470 if (p == NULL) {
7471 Py_DECREF(substring);
7472 return -1;
7473 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007474
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007475 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 outsize = WideCharToMultiByte(code_page, flags,
7477 p, size,
7478 NULL, 0,
7479 NULL, pusedDefaultChar);
7480 if (outsize <= 0)
7481 goto error;
7482 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 if (pusedDefaultChar && *pusedDefaultChar) {
7484 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007487
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 if (*outbytes == NULL) {
7492 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496 }
7497 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 const Py_ssize_t n = PyBytes_Size(*outbytes);
7500 if (outsize > PY_SSIZE_T_MAX - n) {
7501 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7506 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007508 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510 }
7511
7512 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 outsize = WideCharToMultiByte(code_page, flags,
7514 p, size,
7515 out, outsize,
7516 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 if (outsize <= 0)
7519 goto error;
7520 if (pusedDefaultChar && *pusedDefaultChar)
7521 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007523
Victor Stinner3a50e702011-10-18 21:21:00 +02007524error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7527 return -2;
7528 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007529 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007530}
7531
Victor Stinner3a50e702011-10-18 21:21:00 +02007532/*
7533 * Encode a Unicode string to a Windows code page into a byte string using a
7534 * error handler.
7535 *
7536 * Returns consumed characters if succeed, or raise a WindowsError and returns
7537 * -1 on other error.
7538 */
7539static int
7540encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007541 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007542 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007543{
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007545 Py_ssize_t pos = unicode_offset;
7546 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 /* Ideally, we should get reason from FormatMessage. This is the Windows
7548 2000 English version of the message. */
7549 const char *reason = "invalid character";
7550 /* 4=maximum length of a UTF-8 sequence */
7551 char buffer[4];
7552 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7553 Py_ssize_t outsize;
7554 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 PyObject *errorHandler = NULL;
7556 PyObject *exc = NULL;
7557 PyObject *encoding_obj = NULL;
7558 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 PyObject *rep;
7561 int ret = -1;
7562
7563 assert(insize > 0);
7564
7565 encoding = code_page_name(code_page, &encoding_obj);
7566 if (encoding == NULL)
7567 return -1;
7568
7569 if (errors == NULL || strcmp(errors, "strict") == 0) {
7570 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7571 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007572 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 if (exc != NULL) {
7574 PyCodec_StrictErrors(exc);
7575 Py_DECREF(exc);
7576 }
7577 Py_XDECREF(encoding_obj);
7578 return -1;
7579 }
7580
7581 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7582 pusedDefaultChar = &usedDefaultChar;
7583 else
7584 pusedDefaultChar = NULL;
7585
7586 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7587 PyErr_NoMemory();
7588 goto error;
7589 }
7590 outsize = insize * Py_ARRAY_LENGTH(buffer);
7591
7592 if (*outbytes == NULL) {
7593 /* Create string object */
7594 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7595 if (*outbytes == NULL)
7596 goto error;
7597 out = PyBytes_AS_STRING(*outbytes);
7598 }
7599 else {
7600 /* Extend string object */
7601 Py_ssize_t n = PyBytes_Size(*outbytes);
7602 if (n > PY_SSIZE_T_MAX - outsize) {
7603 PyErr_NoMemory();
7604 goto error;
7605 }
7606 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7607 goto error;
7608 out = PyBytes_AS_STRING(*outbytes) + n;
7609 }
7610
7611 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007612 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007614 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7615 wchar_t chars[2];
7616 int charsize;
7617 if (ch < 0x10000) {
7618 chars[0] = (wchar_t)ch;
7619 charsize = 1;
7620 }
7621 else {
7622 ch -= 0x10000;
7623 chars[0] = 0xd800 + (ch >> 10);
7624 chars[1] = 0xdc00 + (ch & 0x3ff);
7625 charsize = 2;
7626 }
7627
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 buffer, Py_ARRAY_LENGTH(buffer),
7631 NULL, pusedDefaultChar);
7632 if (outsize > 0) {
7633 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7634 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 memcpy(out, buffer, outsize);
7637 out += outsize;
7638 continue;
7639 }
7640 }
7641 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7642 PyErr_SetFromWindowsErr(0);
7643 goto error;
7644 }
7645
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 rep = unicode_encode_call_errorhandler(
7647 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007648 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007649 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 if (rep == NULL)
7651 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007653
7654 if (PyBytes_Check(rep)) {
7655 outsize = PyBytes_GET_SIZE(rep);
7656 if (outsize != 1) {
7657 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7658 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7659 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7660 Py_DECREF(rep);
7661 goto error;
7662 }
7663 out = PyBytes_AS_STRING(*outbytes) + offset;
7664 }
7665 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7666 out += outsize;
7667 }
7668 else {
7669 Py_ssize_t i;
7670 enum PyUnicode_Kind kind;
7671 void *data;
7672
Benjamin Petersonbac79492012-01-14 13:34:47 -05007673 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 Py_DECREF(rep);
7675 goto error;
7676 }
7677
7678 outsize = PyUnicode_GET_LENGTH(rep);
7679 if (outsize != 1) {
7680 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7681 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7682 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7683 Py_DECREF(rep);
7684 goto error;
7685 }
7686 out = PyBytes_AS_STRING(*outbytes) + offset;
7687 }
7688 kind = PyUnicode_KIND(rep);
7689 data = PyUnicode_DATA(rep);
7690 for (i=0; i < outsize; i++) {
7691 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7692 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007693 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007694 encoding, unicode,
7695 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 "unable to encode error handler result to ASCII");
7697 Py_DECREF(rep);
7698 goto error;
7699 }
7700 *out = (unsigned char)ch;
7701 out++;
7702 }
7703 }
7704 Py_DECREF(rep);
7705 }
7706 /* write a NUL byte */
7707 *out = 0;
7708 outsize = out - PyBytes_AS_STRING(*outbytes);
7709 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7710 if (_PyBytes_Resize(outbytes, outsize) < 0)
7711 goto error;
7712 ret = 0;
7713
7714error:
7715 Py_XDECREF(encoding_obj);
7716 Py_XDECREF(errorHandler);
7717 Py_XDECREF(exc);
7718 return ret;
7719}
7720
Victor Stinner3a50e702011-10-18 21:21:00 +02007721static PyObject *
7722encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 const char *errors)
7725{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007729 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007730
Benjamin Petersonbac79492012-01-14 13:34:47 -05007731 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007732 return NULL;
7733 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007734
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 if (code_page < 0) {
7736 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7737 return NULL;
7738 }
7739
Martin v. Löwis3d325192011-11-04 18:23:06 +01007740 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007741 return PyBytes_FromStringAndSize(NULL, 0);
7742
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 offset = 0;
7744 do
7745 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007747 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 chunks. */
7749 if (len > INT_MAX/2) {
7750 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007751 done = 0;
7752 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007753 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007754#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007755 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007756 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007757 done = 1;
7758 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007759
Victor Stinner76a31a62011-11-04 00:05:13 +01007760 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007761 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 errors);
7763 if (ret == -2)
7764 ret = encode_code_page_errors(code_page, &outbytes,
7765 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007766 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 if (ret < 0) {
7768 Py_XDECREF(outbytes);
7769 return NULL;
7770 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007771
Victor Stinner7581cef2011-11-03 22:32:33 +01007772 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007773 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007774 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007775
Victor Stinner3a50e702011-10-18 21:21:00 +02007776 return outbytes;
7777}
7778
7779PyObject *
7780PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7781 Py_ssize_t size,
7782 const char *errors)
7783{
Victor Stinner7581cef2011-11-03 22:32:33 +01007784 PyObject *unicode, *res;
7785 unicode = PyUnicode_FromUnicode(p, size);
7786 if (unicode == NULL)
7787 return NULL;
7788 res = encode_code_page(CP_ACP, unicode, errors);
7789 Py_DECREF(unicode);
7790 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007791}
7792
7793PyObject *
7794PyUnicode_EncodeCodePage(int code_page,
7795 PyObject *unicode,
7796 const char *errors)
7797{
Victor Stinner7581cef2011-11-03 22:32:33 +01007798 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007799}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007800
Alexander Belopolsky40018472011-02-26 01:02:56 +00007801PyObject *
7802PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007803{
7804 if (!PyUnicode_Check(unicode)) {
7805 PyErr_BadArgument();
7806 return NULL;
7807 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007808 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007809}
7810
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007811#undef NEED_RETRY
7812
Victor Stinner99b95382011-07-04 14:23:54 +02007813#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007814
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815/* --- Character Mapping Codec -------------------------------------------- */
7816
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817PyObject *
7818PyUnicode_DecodeCharmap(const char *s,
7819 Py_ssize_t size,
7820 PyObject *mapping,
7821 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007824 Py_ssize_t startinpos;
7825 Py_ssize_t endinpos;
7826 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007828 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007829 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830 PyObject *errorHandler = NULL;
7831 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007832
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 /* Default to Latin-1 */
7834 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007837 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007841 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007842 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007844 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007845 Py_ssize_t maplen;
7846 enum PyUnicode_Kind kind;
7847 void *data;
7848 Py_UCS4 x;
7849
Benjamin Petersonbac79492012-01-14 13:34:47 -05007850 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007851 return NULL;
7852
7853 maplen = PyUnicode_GET_LENGTH(mapping);
7854 data = PyUnicode_DATA(mapping);
7855 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 while (s < e) {
7857 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007860 x = PyUnicode_READ(kind, data, ch);
7861 else
7862 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007864 if (x == 0xfffe)
7865 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 startinpos = s-starts;
7868 endinpos = startinpos+1;
7869 if (unicode_decode_call_errorhandler(
7870 errors, &errorHandler,
7871 "charmap", "character maps to <undefined>",
7872 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007873 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 goto onError;
7875 }
7876 continue;
7877 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007878
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007879 if (unicode_putchar(&v, &outpos, x) < 0)
7880 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007882 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007883 }
7884 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 while (s < e) {
7886 unsigned char ch = *s;
7887 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007888
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7890 w = PyLong_FromLong((long)ch);
7891 if (w == NULL)
7892 goto onError;
7893 x = PyObject_GetItem(mapping, w);
7894 Py_DECREF(w);
7895 if (x == NULL) {
7896 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7897 /* No mapping found means: mapping is undefined. */
7898 PyErr_Clear();
7899 x = Py_None;
7900 Py_INCREF(x);
7901 } else
7902 goto onError;
7903 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007904
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 /* Apply mapping */
7906 if (PyLong_Check(x)) {
7907 long value = PyLong_AS_LONG(x);
7908 if (value < 0 || value > 65535) {
7909 PyErr_SetString(PyExc_TypeError,
7910 "character mapping must be in range(65536)");
7911 Py_DECREF(x);
7912 goto onError;
7913 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007914 if (unicode_putchar(&v, &outpos, value) < 0)
7915 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 }
7917 else if (x == Py_None) {
7918 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 startinpos = s-starts;
7920 endinpos = startinpos+1;
7921 if (unicode_decode_call_errorhandler(
7922 errors, &errorHandler,
7923 "charmap", "character maps to <undefined>",
7924 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007925 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 Py_DECREF(x);
7927 goto onError;
7928 }
7929 Py_DECREF(x);
7930 continue;
7931 }
7932 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007933 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007934
Benjamin Petersonbac79492012-01-14 13:34:47 -05007935 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007936 goto onError;
7937 targetsize = PyUnicode_GET_LENGTH(x);
7938
7939 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007941 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007942 PyUnicode_READ_CHAR(x, 0)) < 0)
7943 goto onError;
7944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 else if (targetsize > 1) {
7946 /* 1-n mapping */
7947 if (targetsize > extrachars) {
7948 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 Py_ssize_t needed = (targetsize - extrachars) + \
7950 (targetsize << 2);
7951 extrachars += needed;
7952 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007953 if (unicode_resize(&v,
7954 PyUnicode_GET_LENGTH(v) + needed) < 0)
7955 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 Py_DECREF(x);
7957 goto onError;
7958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007960 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7961 goto onError;
7962 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7963 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 extrachars -= targetsize;
7965 }
7966 /* 1-0 mapping: skip the character */
7967 }
7968 else {
7969 /* wrong return value */
7970 PyErr_SetString(PyExc_TypeError,
7971 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007972 Py_DECREF(x);
7973 goto onError;
7974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 Py_DECREF(x);
7976 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007979 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007981 Py_XDECREF(errorHandler);
7982 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007983 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007984
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 Py_XDECREF(errorHandler);
7987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 Py_XDECREF(v);
7989 return NULL;
7990}
7991
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992/* Charmap encoding: the lookup table */
7993
Alexander Belopolsky40018472011-02-26 01:02:56 +00007994struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 PyObject_HEAD
7996 unsigned char level1[32];
7997 int count2, count3;
7998 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007999};
8000
8001static PyObject*
8002encoding_map_size(PyObject *obj, PyObject* args)
8003{
8004 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008005 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007}
8008
8009static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008010 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 PyDoc_STR("Return the size (in bytes) of this object") },
8012 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013};
8014
8015static void
8016encoding_map_dealloc(PyObject* o)
8017{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008018 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019}
8020
8021static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008022 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 "EncodingMap", /*tp_name*/
8024 sizeof(struct encoding_map), /*tp_basicsize*/
8025 0, /*tp_itemsize*/
8026 /* methods */
8027 encoding_map_dealloc, /*tp_dealloc*/
8028 0, /*tp_print*/
8029 0, /*tp_getattr*/
8030 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008031 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 0, /*tp_repr*/
8033 0, /*tp_as_number*/
8034 0, /*tp_as_sequence*/
8035 0, /*tp_as_mapping*/
8036 0, /*tp_hash*/
8037 0, /*tp_call*/
8038 0, /*tp_str*/
8039 0, /*tp_getattro*/
8040 0, /*tp_setattro*/
8041 0, /*tp_as_buffer*/
8042 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8043 0, /*tp_doc*/
8044 0, /*tp_traverse*/
8045 0, /*tp_clear*/
8046 0, /*tp_richcompare*/
8047 0, /*tp_weaklistoffset*/
8048 0, /*tp_iter*/
8049 0, /*tp_iternext*/
8050 encoding_map_methods, /*tp_methods*/
8051 0, /*tp_members*/
8052 0, /*tp_getset*/
8053 0, /*tp_base*/
8054 0, /*tp_dict*/
8055 0, /*tp_descr_get*/
8056 0, /*tp_descr_set*/
8057 0, /*tp_dictoffset*/
8058 0, /*tp_init*/
8059 0, /*tp_alloc*/
8060 0, /*tp_new*/
8061 0, /*tp_free*/
8062 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008063};
8064
8065PyObject*
8066PyUnicode_BuildEncodingMap(PyObject* string)
8067{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068 PyObject *result;
8069 struct encoding_map *mresult;
8070 int i;
8071 int need_dict = 0;
8072 unsigned char level1[32];
8073 unsigned char level2[512];
8074 unsigned char *mlevel1, *mlevel2, *mlevel3;
8075 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 int kind;
8077 void *data;
8078 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008080 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008081 PyErr_BadArgument();
8082 return NULL;
8083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 kind = PyUnicode_KIND(string);
8085 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086 memset(level1, 0xFF, sizeof level1);
8087 memset(level2, 0xFF, sizeof level2);
8088
8089 /* If there isn't a one-to-one mapping of NULL to \0,
8090 or if there are non-BMP characters, we need to use
8091 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 need_dict = 1;
8094 for (i = 1; i < 256; i++) {
8095 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 ch = PyUnicode_READ(kind, data, i);
8097 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 need_dict = 1;
8099 break;
8100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 /* unmapped character */
8103 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 l1 = ch >> 11;
8105 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 if (level1[l1] == 0xFF)
8107 level1[l1] = count2++;
8108 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 }
8111
8112 if (count2 >= 0xFF || count3 >= 0xFF)
8113 need_dict = 1;
8114
8115 if (need_dict) {
8116 PyObject *result = PyDict_New();
8117 PyObject *key, *value;
8118 if (!result)
8119 return NULL;
8120 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008122 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 if (!key || !value)
8124 goto failed1;
8125 if (PyDict_SetItem(result, key, value) == -1)
8126 goto failed1;
8127 Py_DECREF(key);
8128 Py_DECREF(value);
8129 }
8130 return result;
8131 failed1:
8132 Py_XDECREF(key);
8133 Py_XDECREF(value);
8134 Py_DECREF(result);
8135 return NULL;
8136 }
8137
8138 /* Create a three-level trie */
8139 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8140 16*count2 + 128*count3 - 1);
8141 if (!result)
8142 return PyErr_NoMemory();
8143 PyObject_Init(result, &EncodingMapType);
8144 mresult = (struct encoding_map*)result;
8145 mresult->count2 = count2;
8146 mresult->count3 = count3;
8147 mlevel1 = mresult->level1;
8148 mlevel2 = mresult->level23;
8149 mlevel3 = mresult->level23 + 16*count2;
8150 memcpy(mlevel1, level1, 32);
8151 memset(mlevel2, 0xFF, 16*count2);
8152 memset(mlevel3, 0, 128*count3);
8153 count3 = 0;
8154 for (i = 1; i < 256; i++) {
8155 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 /* unmapped character */
8158 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 o1 = PyUnicode_READ(kind, data, i)>>11;
8160 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 i2 = 16*mlevel1[o1] + o2;
8162 if (mlevel2[i2] == 0xFF)
8163 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 i3 = 128*mlevel2[i2] + o3;
8166 mlevel3[i3] = i;
8167 }
8168 return result;
8169}
8170
8171static int
Victor Stinner22168992011-11-20 17:09:18 +01008172encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173{
8174 struct encoding_map *map = (struct encoding_map*)mapping;
8175 int l1 = c>>11;
8176 int l2 = (c>>7) & 0xF;
8177 int l3 = c & 0x7F;
8178 int i;
8179
Victor Stinner22168992011-11-20 17:09:18 +01008180 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 if (c == 0)
8183 return 0;
8184 /* level 1*/
8185 i = map->level1[l1];
8186 if (i == 0xFF) {
8187 return -1;
8188 }
8189 /* level 2*/
8190 i = map->level23[16*i+l2];
8191 if (i == 0xFF) {
8192 return -1;
8193 }
8194 /* level 3 */
8195 i = map->level23[16*map->count2 + 128*i + l3];
8196 if (i == 0) {
8197 return -1;
8198 }
8199 return i;
8200}
8201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202/* Lookup the character ch in the mapping. If the character
8203 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008204 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008206charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207{
Christian Heimes217cfd12007-12-02 14:31:20 +00008208 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 PyObject *x;
8210
8211 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 x = PyObject_GetItem(mapping, w);
8214 Py_DECREF(w);
8215 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8217 /* No mapping found means: mapping is undefined. */
8218 PyErr_Clear();
8219 x = Py_None;
8220 Py_INCREF(x);
8221 return x;
8222 } else
8223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008225 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008227 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 long value = PyLong_AS_LONG(x);
8229 if (value < 0 || value > 255) {
8230 PyErr_SetString(PyExc_TypeError,
8231 "character mapping must be in range(256)");
8232 Py_DECREF(x);
8233 return NULL;
8234 }
8235 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008237 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 /* wrong return value */
8241 PyErr_Format(PyExc_TypeError,
8242 "character mapping must return integer, bytes or None, not %.400s",
8243 x->ob_type->tp_name);
8244 Py_DECREF(x);
8245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 }
8247}
8248
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008250charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008251{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008252 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8253 /* exponentially overallocate to minimize reallocations */
8254 if (requiredsize < 2*outsize)
8255 requiredsize = 2*outsize;
8256 if (_PyBytes_Resize(outobj, requiredsize))
8257 return -1;
8258 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259}
8260
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008265 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 space is available. Return a new reference to the object that
8267 was put in the output buffer, or Py_None, if the mapping was undefined
8268 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008269 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008271charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008272 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274 PyObject *rep;
8275 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277
Christian Heimes90aa7642007-12-19 02:45:37 +00008278 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 if (res == -1)
8282 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (outsize<requiredsize)
8284 if (charmapencode_resize(outobj, outpos, requiredsize))
8285 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008286 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 outstart[(*outpos)++] = (char)res;
8288 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289 }
8290
8291 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 Py_DECREF(rep);
8296 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (PyLong_Check(rep)) {
8299 Py_ssize_t requiredsize = *outpos+1;
8300 if (outsize<requiredsize)
8301 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8302 Py_DECREF(rep);
8303 return enc_EXCEPTION;
8304 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008305 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008307 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 else {
8309 const char *repchars = PyBytes_AS_STRING(rep);
8310 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8311 Py_ssize_t requiredsize = *outpos+repsize;
8312 if (outsize<requiredsize)
8313 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8314 Py_DECREF(rep);
8315 return enc_EXCEPTION;
8316 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008317 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 memcpy(outstart + *outpos, repchars, repsize);
8319 *outpos += repsize;
8320 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 Py_DECREF(rep);
8323 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324}
8325
8326/* handle an error in PyUnicode_EncodeCharmap
8327 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static int
8329charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008330 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008332 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008333 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334{
8335 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008336 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008337 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008338 enum PyUnicode_Kind kind;
8339 void *data;
8340 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008342 Py_ssize_t collstartpos = *inpos;
8343 Py_ssize_t collendpos = *inpos+1;
8344 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 char *encoding = "charmap";
8346 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008348 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008349 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350
Benjamin Petersonbac79492012-01-14 13:34:47 -05008351 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008352 return -1;
8353 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 /* find all unencodable characters */
8355 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008357 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008358 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008359 val = encoding_map_lookup(ch, mapping);
8360 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 break;
8362 ++collendpos;
8363 continue;
8364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008365
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8367 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 if (rep==NULL)
8369 return -1;
8370 else if (rep!=Py_None) {
8371 Py_DECREF(rep);
8372 break;
8373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008374 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 }
8377 /* cache callback name lookup
8378 * (if not done yet, i.e. it's the first error) */
8379 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 if ((errors==NULL) || (!strcmp(errors, "strict")))
8381 *known_errorHandler = 1;
8382 else if (!strcmp(errors, "replace"))
8383 *known_errorHandler = 2;
8384 else if (!strcmp(errors, "ignore"))
8385 *known_errorHandler = 3;
8386 else if (!strcmp(errors, "xmlcharrefreplace"))
8387 *known_errorHandler = 4;
8388 else
8389 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 }
8391 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008393 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 return -1;
8395 case 2: /* replace */
8396 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 x = charmapencode_output('?', mapping, res, respos);
8398 if (x==enc_EXCEPTION) {
8399 return -1;
8400 }
8401 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008402 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return -1;
8404 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405 }
8406 /* fall through */
8407 case 3: /* ignore */
8408 *inpos = collendpos;
8409 break;
8410 case 4: /* xmlcharrefreplace */
8411 /* generate replacement (temporarily (mis)uses p) */
8412 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 char buffer[2+29+1+1];
8414 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008415 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 for (cp = buffer; *cp; ++cp) {
8417 x = charmapencode_output(*cp, mapping, res, respos);
8418 if (x==enc_EXCEPTION)
8419 return -1;
8420 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008421 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 return -1;
8423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 }
8425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 *inpos = collendpos;
8427 break;
8428 default:
8429 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008430 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008434 if (PyBytes_Check(repunicode)) {
8435 /* Directly copy bytes result to output. */
8436 Py_ssize_t outsize = PyBytes_Size(*res);
8437 Py_ssize_t requiredsize;
8438 repsize = PyBytes_Size(repunicode);
8439 requiredsize = *respos + repsize;
8440 if (requiredsize > outsize)
8441 /* Make room for all additional bytes. */
8442 if (charmapencode_resize(res, respos, requiredsize)) {
8443 Py_DECREF(repunicode);
8444 return -1;
8445 }
8446 memcpy(PyBytes_AsString(*res) + *respos,
8447 PyBytes_AsString(repunicode), repsize);
8448 *respos += repsize;
8449 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008450 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008451 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008454 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008455 Py_DECREF(repunicode);
8456 return -1;
8457 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008458 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008459 data = PyUnicode_DATA(repunicode);
8460 kind = PyUnicode_KIND(repunicode);
8461 for (index = 0; index < repsize; index++) {
8462 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8463 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008465 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 return -1;
8467 }
8468 else if (x==enc_FAILED) {
8469 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008470 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 return -1;
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 }
8474 *inpos = newpos;
8475 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 }
8477 return 0;
8478}
8479
Alexander Belopolsky40018472011-02-26 01:02:56 +00008480PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008481_PyUnicode_EncodeCharmap(PyObject *unicode,
8482 PyObject *mapping,
8483 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 /* output object */
8486 PyObject *res = NULL;
8487 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008488 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008489 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008491 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 PyObject *errorHandler = NULL;
8493 PyObject *exc = NULL;
8494 /* the following variable is used for caching string comparisons
8495 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8496 * 3=ignore, 4=xmlcharrefreplace */
8497 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498
Benjamin Petersonbac79492012-01-14 13:34:47 -05008499 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008500 return NULL;
8501 size = PyUnicode_GET_LENGTH(unicode);
8502
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503 /* Default to Latin-1 */
8504 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008505 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 /* allocate enough for a simple encoding without
8508 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008509 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 if (res == NULL)
8511 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008512 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008518 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 if (x==enc_EXCEPTION) /* error */
8520 goto onError;
8521 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008522 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 &exc,
8524 &known_errorHandler, &errorHandler, errors,
8525 &res, &respos)) {
8526 goto onError;
8527 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 else
8530 /* done with this character => adjust input position */
8531 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008535 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008536 if (_PyBytes_Resize(&res, respos) < 0)
8537 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 Py_XDECREF(exc);
8540 Py_XDECREF(errorHandler);
8541 return res;
8542
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 Py_XDECREF(res);
8545 Py_XDECREF(exc);
8546 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 return NULL;
8548}
8549
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550/* Deprecated */
8551PyObject *
8552PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8553 Py_ssize_t size,
8554 PyObject *mapping,
8555 const char *errors)
8556{
8557 PyObject *result;
8558 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8559 if (unicode == NULL)
8560 return NULL;
8561 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8562 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008563 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564}
8565
Alexander Belopolsky40018472011-02-26 01:02:56 +00008566PyObject *
8567PyUnicode_AsCharmapString(PyObject *unicode,
8568 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569{
8570 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 PyErr_BadArgument();
8572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008574 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575}
8576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008578static void
8579make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008581 Py_ssize_t startpos, Py_ssize_t endpos,
8582 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 *exceptionObject = _PyUnicodeTranslateError_Create(
8586 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 }
8588 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8590 goto onError;
8591 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8592 goto onError;
8593 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8594 goto onError;
8595 return;
8596 onError:
8597 Py_DECREF(*exceptionObject);
8598 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 }
8600}
8601
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008603static void
8604raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008606 Py_ssize_t startpos, Py_ssize_t endpos,
8607 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608{
8609 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613}
8614
8615/* error handling callback helper:
8616 build arguments, call the callback and check the arguments,
8617 put the result into newpos and return the replacement string, which
8618 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619static PyObject *
8620unicode_translate_call_errorhandler(const char *errors,
8621 PyObject **errorHandler,
8622 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008624 Py_ssize_t startpos, Py_ssize_t endpos,
8625 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008627 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008629 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 PyObject *restuple;
8631 PyObject *resunicode;
8632
8633 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 }
8638
8639 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643
8644 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008649 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 Py_DECREF(restuple);
8651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 }
8653 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 &resunicode, &i_newpos)) {
8655 Py_DECREF(restuple);
8656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008658 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008660 else
8661 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8664 Py_DECREF(restuple);
8665 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 Py_INCREF(resunicode);
8668 Py_DECREF(restuple);
8669 return resunicode;
8670}
8671
8672/* Lookup the character ch in the mapping and put the result in result,
8673 which must be decrefed by the caller.
8674 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008675static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677{
Christian Heimes217cfd12007-12-02 14:31:20 +00008678 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 PyObject *x;
8680
8681 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 x = PyObject_GetItem(mapping, w);
8684 Py_DECREF(w);
8685 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8687 /* No mapping found means: use 1:1 mapping. */
8688 PyErr_Clear();
8689 *result = NULL;
8690 return 0;
8691 } else
8692 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 }
8694 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 *result = x;
8696 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008698 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 long value = PyLong_AS_LONG(x);
8700 long max = PyUnicode_GetMax();
8701 if (value < 0 || value > max) {
8702 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008703 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 Py_DECREF(x);
8705 return -1;
8706 }
8707 *result = x;
8708 return 0;
8709 }
8710 else if (PyUnicode_Check(x)) {
8711 *result = x;
8712 return 0;
8713 }
8714 else {
8715 /* wrong return value */
8716 PyErr_SetString(PyExc_TypeError,
8717 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 Py_DECREF(x);
8719 return -1;
8720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721}
8722/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 if not reallocate and adjust various state variables.
8724 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008725static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008730 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 /* exponentially overallocate to minimize reallocations */
8732 if (requiredsize < 2 * oldsize)
8733 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8735 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 }
8739 return 0;
8740}
8741/* lookup the character, put the result in the output string and adjust
8742 various state variables. Return a new reference to the object that
8743 was put in the output buffer in *result, or Py_None, if the mapping was
8744 undefined (in which case no character was written).
8745 The called must decref result.
8746 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008747static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8749 PyObject *mapping, Py_UCS4 **output,
8750 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008751 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8754 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 }
8760 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008762 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765 }
8766 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 Py_ssize_t repsize;
8768 if (PyUnicode_READY(*res) == -1)
8769 return -1;
8770 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 if (repsize==1) {
8772 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 }
8775 else if (repsize!=0) {
8776 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 Py_ssize_t requiredsize = *opos +
8778 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 Py_ssize_t i;
8781 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 for(i = 0; i < repsize; i++)
8784 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786 }
8787 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 return 0;
8790}
8791
Alexander Belopolsky40018472011-02-26 01:02:56 +00008792PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793_PyUnicode_TranslateCharmap(PyObject *input,
8794 PyObject *mapping,
8795 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 /* input object */
8798 char *idata;
8799 Py_ssize_t size, i;
8800 int kind;
8801 /* output buffer */
8802 Py_UCS4 *output = NULL;
8803 Py_ssize_t osize;
8804 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 char *reason = "character maps to <undefined>";
8808 PyObject *errorHandler = NULL;
8809 PyObject *exc = NULL;
8810 /* the following variable is used for caching string comparisons
8811 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8812 * 3=ignore, 4=xmlcharrefreplace */
8813 int known_errorHandler = -1;
8814
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 PyErr_BadArgument();
8817 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 if (PyUnicode_READY(input) == -1)
8821 return NULL;
8822 idata = (char*)PyUnicode_DATA(input);
8823 kind = PyUnicode_KIND(input);
8824 size = PyUnicode_GET_LENGTH(input);
8825 i = 0;
8826
8827 if (size == 0) {
8828 Py_INCREF(input);
8829 return input;
8830 }
8831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 /* allocate enough for a simple 1:1 translation without
8833 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 osize = size;
8835 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8836 opos = 0;
8837 if (output == NULL) {
8838 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 /* try to encode it */
8844 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 if (charmaptranslate_output(input, i, mapping,
8846 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 Py_XDECREF(x);
8848 goto onError;
8849 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008850 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 else { /* untranslatable character */
8854 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8855 Py_ssize_t repsize;
8856 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 Py_ssize_t collstart = i;
8860 Py_ssize_t collend = i+1;
8861 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 while (collend < size) {
8865 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 goto onError;
8867 Py_XDECREF(x);
8868 if (x!=Py_None)
8869 break;
8870 ++collend;
8871 }
8872 /* cache callback name lookup
8873 * (if not done yet, i.e. it's the first error) */
8874 if (known_errorHandler==-1) {
8875 if ((errors==NULL) || (!strcmp(errors, "strict")))
8876 known_errorHandler = 1;
8877 else if (!strcmp(errors, "replace"))
8878 known_errorHandler = 2;
8879 else if (!strcmp(errors, "ignore"))
8880 known_errorHandler = 3;
8881 else if (!strcmp(errors, "xmlcharrefreplace"))
8882 known_errorHandler = 4;
8883 else
8884 known_errorHandler = 0;
8885 }
8886 switch (known_errorHandler) {
8887 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 raise_translate_exception(&exc, input, collstart,
8889 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008890 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 case 2: /* replace */
8892 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 for (coll = collstart; coll<collend; coll++)
8894 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 /* fall through */
8896 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 break;
8899 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 /* generate replacement (temporarily (mis)uses i) */
8901 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 char buffer[2+29+1+1];
8903 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8905 if (charmaptranslate_makespace(&output, &osize,
8906 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 goto onError;
8908 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 break;
8913 default:
8914 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 reason, input, &exc,
8916 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008917 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008919 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008920 Py_DECREF(repunicode);
8921 goto onError;
8922 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 repsize = PyUnicode_GET_LENGTH(repunicode);
8925 if (charmaptranslate_makespace(&output, &osize,
8926 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 Py_DECREF(repunicode);
8928 goto onError;
8929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 for (uni2 = 0; repsize-->0; ++uni2)
8931 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8932 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008935 }
8936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8938 if (!res)
8939 goto onError;
8940 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941 Py_XDECREF(exc);
8942 Py_XDECREF(errorHandler);
8943 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008947 Py_XDECREF(exc);
8948 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 return NULL;
8950}
8951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952/* Deprecated. Use PyUnicode_Translate instead. */
8953PyObject *
8954PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8955 Py_ssize_t size,
8956 PyObject *mapping,
8957 const char *errors)
8958{
8959 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8960 if (!unicode)
8961 return NULL;
8962 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8963}
8964
Alexander Belopolsky40018472011-02-26 01:02:56 +00008965PyObject *
8966PyUnicode_Translate(PyObject *str,
8967 PyObject *mapping,
8968 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969{
8970 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 str = PyUnicode_FromObject(str);
8973 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 Py_DECREF(str);
8977 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008978
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 Py_XDECREF(str);
8981 return NULL;
8982}
Tim Petersced69f82003-09-16 20:30:58 +00008983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008985fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986{
8987 /* No need to call PyUnicode_READY(self) because this function is only
8988 called as a callback from fixup() which does it already. */
8989 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8990 const int kind = PyUnicode_KIND(self);
8991 void *data = PyUnicode_DATA(self);
8992 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008993 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 Py_ssize_t i;
8995
8996 for (i = 0; i < len; ++i) {
8997 ch = PyUnicode_READ(kind, data, i);
8998 fixed = 0;
8999 if (ch > 127) {
9000 if (Py_UNICODE_ISSPACE(ch))
9001 fixed = ' ';
9002 else {
9003 const int decimal = Py_UNICODE_TODECIMAL(ch);
9004 if (decimal >= 0)
9005 fixed = '0' + decimal;
9006 }
9007 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009008 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (fixed > maxchar)
9010 maxchar = fixed;
9011 PyUnicode_WRITE(kind, data, i, fixed);
9012 }
9013 else if (ch > maxchar)
9014 maxchar = ch;
9015 }
9016 else if (ch > maxchar)
9017 maxchar = ch;
9018 }
9019
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009020 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021}
9022
9023PyObject *
9024_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9025{
9026 if (!PyUnicode_Check(unicode)) {
9027 PyErr_BadInternalCall();
9028 return NULL;
9029 }
9030 if (PyUnicode_READY(unicode) == -1)
9031 return NULL;
9032 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9033 /* If the string is already ASCII, just return the same string */
9034 Py_INCREF(unicode);
9035 return unicode;
9036 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009037 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038}
9039
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009040PyObject *
9041PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9042 Py_ssize_t length)
9043{
Victor Stinnerf0124502011-11-21 23:12:56 +01009044 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009045 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009046 Py_UCS4 maxchar;
9047 enum PyUnicode_Kind kind;
9048 void *data;
9049
Victor Stinner99d7ad02012-02-22 13:37:39 +01009050 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009051 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009052 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009053 if (ch > 127) {
9054 int decimal = Py_UNICODE_TODECIMAL(ch);
9055 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009056 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009057 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009058 }
9059 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009060
9061 /* Copy to a new string */
9062 decimal = PyUnicode_New(length, maxchar);
9063 if (decimal == NULL)
9064 return decimal;
9065 kind = PyUnicode_KIND(decimal);
9066 data = PyUnicode_DATA(decimal);
9067 /* Iterate over code points */
9068 for (i = 0; i < length; i++) {
9069 Py_UNICODE ch = s[i];
9070 if (ch > 127) {
9071 int decimal = Py_UNICODE_TODECIMAL(ch);
9072 if (decimal >= 0)
9073 ch = '0' + decimal;
9074 }
9075 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009077 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009078}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009079/* --- Decimal Encoder ---------------------------------------------------- */
9080
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081int
9082PyUnicode_EncodeDecimal(Py_UNICODE *s,
9083 Py_ssize_t length,
9084 char *output,
9085 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009086{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009087 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009088 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009089 enum PyUnicode_Kind kind;
9090 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009091
9092 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 PyErr_BadArgument();
9094 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009095 }
9096
Victor Stinner42bf7752011-11-21 22:52:58 +01009097 unicode = PyUnicode_FromUnicode(s, length);
9098 if (unicode == NULL)
9099 return -1;
9100
Benjamin Petersonbac79492012-01-14 13:34:47 -05009101 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009102 Py_DECREF(unicode);
9103 return -1;
9104 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009105 kind = PyUnicode_KIND(unicode);
9106 data = PyUnicode_DATA(unicode);
9107
Victor Stinnerb84d7232011-11-22 01:50:07 +01009108 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009109 PyObject *exc;
9110 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009112 Py_ssize_t startpos;
9113
9114 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009115
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009117 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009118 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 decimal = Py_UNICODE_TODECIMAL(ch);
9122 if (decimal >= 0) {
9123 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009124 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 continue;
9126 }
9127 if (0 < ch && ch < 256) {
9128 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009129 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 continue;
9131 }
Victor Stinner6345be92011-11-25 20:09:01 +01009132
Victor Stinner42bf7752011-11-21 22:52:58 +01009133 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009134 exc = NULL;
9135 raise_encode_exception(&exc, "decimal", unicode,
9136 startpos, startpos+1,
9137 "invalid decimal Unicode string");
9138 Py_XDECREF(exc);
9139 Py_DECREF(unicode);
9140 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009141 }
9142 /* 0-terminate the output string */
9143 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009144 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009145 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009146}
9147
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148/* --- Helpers ------------------------------------------------------------ */
9149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009151any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 Py_ssize_t start,
9153 Py_ssize_t end)
9154{
9155 int kind1, kind2, kind;
9156 void *buf1, *buf2;
9157 Py_ssize_t len1, len2, result;
9158
9159 kind1 = PyUnicode_KIND(s1);
9160 kind2 = PyUnicode_KIND(s2);
9161 kind = kind1 > kind2 ? kind1 : kind2;
9162 buf1 = PyUnicode_DATA(s1);
9163 buf2 = PyUnicode_DATA(s2);
9164 if (kind1 != kind)
9165 buf1 = _PyUnicode_AsKind(s1, kind);
9166 if (!buf1)
9167 return -2;
9168 if (kind2 != kind)
9169 buf2 = _PyUnicode_AsKind(s2, kind);
9170 if (!buf2) {
9171 if (kind1 != kind) PyMem_Free(buf1);
9172 return -2;
9173 }
9174 len1 = PyUnicode_GET_LENGTH(s1);
9175 len2 = PyUnicode_GET_LENGTH(s2);
9176
Victor Stinner794d5672011-10-10 03:21:36 +02009177 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009178 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009179 case PyUnicode_1BYTE_KIND:
9180 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9181 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9182 else
9183 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9184 break;
9185 case PyUnicode_2BYTE_KIND:
9186 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9187 break;
9188 case PyUnicode_4BYTE_KIND:
9189 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9190 break;
9191 default:
9192 assert(0); result = -2;
9193 }
9194 }
9195 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009196 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009197 case PyUnicode_1BYTE_KIND:
9198 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9199 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9200 else
9201 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9202 break;
9203 case PyUnicode_2BYTE_KIND:
9204 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9205 break;
9206 case PyUnicode_4BYTE_KIND:
9207 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9208 break;
9209 default:
9210 assert(0); result = -2;
9211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 }
9213
9214 if (kind1 != kind)
9215 PyMem_Free(buf1);
9216 if (kind2 != kind)
9217 PyMem_Free(buf2);
9218
9219 return result;
9220}
9221
9222Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009223_PyUnicode_InsertThousandsGrouping(
9224 PyObject *unicode, Py_ssize_t index,
9225 Py_ssize_t n_buffer,
9226 void *digits, Py_ssize_t n_digits,
9227 Py_ssize_t min_width,
9228 const char *grouping, PyObject *thousands_sep,
9229 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230{
Victor Stinner41a863c2012-02-24 00:37:51 +01009231 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009232 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009233 Py_ssize_t thousands_sep_len;
9234 Py_ssize_t len;
9235
9236 if (unicode != NULL) {
9237 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009238 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009239 }
9240 else {
9241 kind = PyUnicode_1BYTE_KIND;
9242 data = NULL;
9243 }
9244 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9245 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9246 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9247 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009248 if (thousands_sep_kind < kind) {
9249 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9250 if (!thousands_sep_data)
9251 return -1;
9252 }
9253 else {
9254 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9255 if (!data)
9256 return -1;
9257 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009258 }
9259
Benjamin Petersonead6b532011-12-20 17:23:42 -06009260 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009262 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009263 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009264 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009266 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009267 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009268 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009269 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009270 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009271 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009272 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009274 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009275 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009276 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009277 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009278 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009280 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009281 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009282 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009283 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009284 break;
9285 default:
9286 assert(0);
9287 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009289 if (unicode != NULL && thousands_sep_kind != kind) {
9290 if (thousands_sep_kind < kind)
9291 PyMem_Free(thousands_sep_data);
9292 else
9293 PyMem_Free(data);
9294 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009295 if (unicode == NULL) {
9296 *maxchar = 127;
9297 if (len != n_digits) {
9298 *maxchar = Py_MAX(*maxchar,
9299 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9300 }
9301 }
9302 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303}
9304
9305
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to the end of the
       sequence (`len` is added) and then clamped at 0.
   `start` is NOT clamped from above; callers must cope with start > end.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.  The body
   is wrapped in do { } while (0) so that the macro behaves as a single
   statement (e.g. inside an unbraced if/else); invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009320
Alexander Belopolsky40018472011-02-26 01:02:56 +00009321Py_ssize_t
9322PyUnicode_Count(PyObject *str,
9323 PyObject *substr,
9324 Py_ssize_t start,
9325 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009327 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009328 PyObject* str_obj;
9329 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 int kind1, kind2, kind;
9331 void *buf1 = NULL, *buf2 = NULL;
9332 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009333
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009334 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009335 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009336 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009337 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009338 if (!sub_obj) {
9339 Py_DECREF(str_obj);
9340 return -1;
9341 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009342 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009343 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 Py_DECREF(str_obj);
9345 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 }
Tim Petersced69f82003-09-16 20:30:58 +00009347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 kind1 = PyUnicode_KIND(str_obj);
9349 kind2 = PyUnicode_KIND(sub_obj);
9350 kind = kind1 > kind2 ? kind1 : kind2;
9351 buf1 = PyUnicode_DATA(str_obj);
9352 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009353 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 if (!buf1)
9355 goto onError;
9356 buf2 = PyUnicode_DATA(sub_obj);
9357 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009358 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 if (!buf2)
9360 goto onError;
9361 len1 = PyUnicode_GET_LENGTH(str_obj);
9362 len2 = PyUnicode_GET_LENGTH(sub_obj);
9363
9364 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009365 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009367 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9368 result = asciilib_count(
9369 ((Py_UCS1*)buf1) + start, end - start,
9370 buf2, len2, PY_SSIZE_T_MAX
9371 );
9372 else
9373 result = ucs1lib_count(
9374 ((Py_UCS1*)buf1) + start, end - start,
9375 buf2, len2, PY_SSIZE_T_MAX
9376 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 break;
9378 case PyUnicode_2BYTE_KIND:
9379 result = ucs2lib_count(
9380 ((Py_UCS2*)buf1) + start, end - start,
9381 buf2, len2, PY_SSIZE_T_MAX
9382 );
9383 break;
9384 case PyUnicode_4BYTE_KIND:
9385 result = ucs4lib_count(
9386 ((Py_UCS4*)buf1) + start, end - start,
9387 buf2, len2, PY_SSIZE_T_MAX
9388 );
9389 break;
9390 default:
9391 assert(0); result = 0;
9392 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393
9394 Py_DECREF(sub_obj);
9395 Py_DECREF(str_obj);
9396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 if (kind1 != kind)
9398 PyMem_Free(buf1);
9399 if (kind2 != kind)
9400 PyMem_Free(buf2);
9401
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 onError:
9404 Py_DECREF(sub_obj);
9405 Py_DECREF(str_obj);
9406 if (kind1 != kind && buf1)
9407 PyMem_Free(buf1);
9408 if (kind2 != kind && buf2)
9409 PyMem_Free(buf2);
9410 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411}
9412
Alexander Belopolsky40018472011-02-26 01:02:56 +00009413Py_ssize_t
9414PyUnicode_Find(PyObject *str,
9415 PyObject *sub,
9416 Py_ssize_t start,
9417 Py_ssize_t end,
9418 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009420 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009421
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009423 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009425 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009426 if (!sub) {
9427 Py_DECREF(str);
9428 return -2;
9429 }
9430 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9431 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 Py_DECREF(str);
9433 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 }
Tim Petersced69f82003-09-16 20:30:58 +00009435
Victor Stinner794d5672011-10-10 03:21:36 +02009436 result = any_find_slice(direction,
9437 str, sub, start, end
9438 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009441 Py_DECREF(sub);
9442
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 return result;
9444}
9445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446Py_ssize_t
9447PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9448 Py_ssize_t start, Py_ssize_t end,
9449 int direction)
9450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009452 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 if (PyUnicode_READY(str) == -1)
9454 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009455 if (start < 0 || end < 0) {
9456 PyErr_SetString(PyExc_IndexError, "string index out of range");
9457 return -2;
9458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 if (end > PyUnicode_GET_LENGTH(str))
9460 end = PyUnicode_GET_LENGTH(str);
9461 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009462 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9463 kind, end-start, ch, direction);
9464 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009466 else
9467 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468}
9469
Alexander Belopolsky40018472011-02-26 01:02:56 +00009470static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009471tailmatch(PyObject *self,
9472 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009473 Py_ssize_t start,
9474 Py_ssize_t end,
9475 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 int kind_self;
9478 int kind_sub;
9479 void *data_self;
9480 void *data_sub;
9481 Py_ssize_t offset;
9482 Py_ssize_t i;
9483 Py_ssize_t end_sub;
9484
9485 if (PyUnicode_READY(self) == -1 ||
9486 PyUnicode_READY(substring) == -1)
9487 return 0;
9488
9489 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 return 1;
9491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9493 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 kind_self = PyUnicode_KIND(self);
9498 data_self = PyUnicode_DATA(self);
9499 kind_sub = PyUnicode_KIND(substring);
9500 data_sub = PyUnicode_DATA(substring);
9501 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9502
9503 if (direction > 0)
9504 offset = end;
9505 else
9506 offset = start;
9507
9508 if (PyUnicode_READ(kind_self, data_self, offset) ==
9509 PyUnicode_READ(kind_sub, data_sub, 0) &&
9510 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9511 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9512 /* If both are of the same kind, memcmp is sufficient */
9513 if (kind_self == kind_sub) {
9514 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009515 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 data_sub,
9517 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009518 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 }
9520 /* otherwise we have to compare each character by first accesing it */
9521 else {
9522 /* We do not need to compare 0 and len(substring)-1 because
9523 the if statement above ensured already that they are equal
9524 when we end up here. */
9525 // TODO: honor direction and do a forward or backwards search
9526 for (i = 1; i < end_sub; ++i) {
9527 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9528 PyUnicode_READ(kind_sub, data_sub, i))
9529 return 0;
9530 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 }
9534
9535 return 0;
9536}
9537
Alexander Belopolsky40018472011-02-26 01:02:56 +00009538Py_ssize_t
9539PyUnicode_Tailmatch(PyObject *str,
9540 PyObject *substr,
9541 Py_ssize_t start,
9542 Py_ssize_t end,
9543 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009545 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009546
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 str = PyUnicode_FromObject(str);
9548 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 substr = PyUnicode_FromObject(substr);
9551 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 Py_DECREF(str);
9553 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554 }
Tim Petersced69f82003-09-16 20:30:58 +00009555
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009556 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009557 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 Py_DECREF(str);
9559 Py_DECREF(substr);
9560 return result;
9561}
9562
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563/* Apply fixfct filter to the Unicode object self and return a
9564 reference to the modified object */
9565
Alexander Belopolsky40018472011-02-26 01:02:56 +00009566static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009567fixup(PyObject *self,
9568 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 PyObject *u;
9571 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009572 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009574 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009577 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 /* fix functions return the new maximum character in a string,
9580 if the kind of the resulting unicode object does not change,
9581 everything is fine. Otherwise we need to change the string kind
9582 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009583 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009584
9585 if (maxchar_new == 0) {
9586 /* no changes */;
9587 if (PyUnicode_CheckExact(self)) {
9588 Py_DECREF(u);
9589 Py_INCREF(self);
9590 return self;
9591 }
9592 else
9593 return u;
9594 }
9595
9596 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 maxchar_new = 127;
9598 else if (maxchar_new <= 255)
9599 maxchar_new = 255;
9600 else if (maxchar_new <= 65535)
9601 maxchar_new = 65535;
9602 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009603 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604
Victor Stinnereaab6042011-12-11 22:22:39 +01009605 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009607
9608 /* In case the maximum character changed, we need to
9609 convert the string to the new category. */
9610 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9611 if (v == NULL) {
9612 Py_DECREF(u);
9613 return NULL;
9614 }
9615 if (maxchar_new > maxchar_old) {
9616 /* If the maxchar increased so that the kind changed, not all
9617 characters are representable anymore and we need to fix the
9618 string again. This only happens in very few cases. */
9619 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9620 maxchar_old = fixfct(v);
9621 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 }
9623 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009624 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009626 Py_DECREF(u);
9627 assert(_PyUnicode_CheckConsistency(v, 1));
9628 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629}
9630
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631static PyObject *
9632ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009634 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9635 char *resdata, *data = PyUnicode_DATA(self);
9636 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009637
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638 res = PyUnicode_New(len, 127);
9639 if (res == NULL)
9640 return NULL;
9641 resdata = PyUnicode_DATA(res);
9642 if (lower)
9643 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 _Py_bytes_upper(resdata, data, len);
9646 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647}
9648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 Py_ssize_t j;
9653 int final_sigma;
9654 Py_UCS4 c;
9655 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009656
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9658
9659 where ! is a negation and \p{xxx} is a character with property xxx.
9660 */
9661 for (j = i - 1; j >= 0; j--) {
9662 c = PyUnicode_READ(kind, data, j);
9663 if (!_PyUnicode_IsCaseIgnorable(c))
9664 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9667 if (final_sigma) {
9668 for (j = i + 1; j < length; j++) {
9669 c = PyUnicode_READ(kind, data, j);
9670 if (!_PyUnicode_IsCaseIgnorable(c))
9671 break;
9672 }
9673 final_sigma = j == length || !_PyUnicode_IsCased(c);
9674 }
9675 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676}
9677
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678static int
9679lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9680 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682 /* Obscure special case. */
9683 if (c == 0x3A3) {
9684 mapped[0] = handle_capital_sigma(kind, data, length, i);
9685 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009687 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688}
9689
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690static Py_ssize_t
9691do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 Py_ssize_t i, k = 0;
9694 int n_res, j;
9695 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009696
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 c = PyUnicode_READ(kind, data, 0);
9698 n_res = _PyUnicode_ToUpperFull(c, mapped);
9699 for (j = 0; j < n_res; j++) {
9700 if (mapped[j] > *maxchar)
9701 *maxchar = mapped[j];
9702 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 for (i = 1; i < length; i++) {
9705 c = PyUnicode_READ(kind, data, i);
9706 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9707 for (j = 0; j < n_res; j++) {
9708 if (mapped[j] > *maxchar)
9709 *maxchar = mapped[j];
9710 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009711 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009712 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714}
9715
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716static Py_ssize_t
9717do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9718 Py_ssize_t i, k = 0;
9719
9720 for (i = 0; i < length; i++) {
9721 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9722 int n_res, j;
9723 if (Py_UNICODE_ISUPPER(c)) {
9724 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9725 }
9726 else if (Py_UNICODE_ISLOWER(c)) {
9727 n_res = _PyUnicode_ToUpperFull(c, mapped);
9728 }
9729 else {
9730 n_res = 1;
9731 mapped[0] = c;
9732 }
9733 for (j = 0; j < n_res; j++) {
9734 if (mapped[j] > *maxchar)
9735 *maxchar = mapped[j];
9736 res[k++] = mapped[j];
9737 }
9738 }
9739 return k;
9740}
9741
9742static Py_ssize_t
9743do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9744 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009746 Py_ssize_t i, k = 0;
9747
9748 for (i = 0; i < length; i++) {
9749 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9750 int n_res, j;
9751 if (lower)
9752 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9753 else
9754 n_res = _PyUnicode_ToUpperFull(c, mapped);
9755 for (j = 0; j < n_res; j++) {
9756 if (mapped[j] > *maxchar)
9757 *maxchar = mapped[j];
9758 res[k++] = mapped[j];
9759 }
9760 }
9761 return k;
9762}
9763
9764static Py_ssize_t
9765do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9766{
9767 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9768}
9769
9770static Py_ssize_t
9771do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9772{
9773 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9774}
9775
Benjamin Petersone51757f2012-01-12 21:10:29 -05009776static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009777do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9778{
9779 Py_ssize_t i, k = 0;
9780
9781 for (i = 0; i < length; i++) {
9782 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9783 Py_UCS4 mapped[3];
9784 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9785 for (j = 0; j < n_res; j++) {
9786 if (mapped[j] > *maxchar)
9787 *maxchar = mapped[j];
9788 res[k++] = mapped[j];
9789 }
9790 }
9791 return k;
9792}
9793
9794static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009795do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9796{
9797 Py_ssize_t i, k = 0;
9798 int previous_is_cased;
9799
9800 previous_is_cased = 0;
9801 for (i = 0; i < length; i++) {
9802 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9803 Py_UCS4 mapped[3];
9804 int n_res, j;
9805
9806 if (previous_is_cased)
9807 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9808 else
9809 n_res = _PyUnicode_ToTitleFull(c, mapped);
9810
9811 for (j = 0; j < n_res; j++) {
9812 if (mapped[j] > *maxchar)
9813 *maxchar = mapped[j];
9814 res[k++] = mapped[j];
9815 }
9816
9817 previous_is_cased = _PyUnicode_IsCased(c);
9818 }
9819 return k;
9820}
9821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822static PyObject *
9823case_operation(PyObject *self,
9824 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9825{
9826 PyObject *res = NULL;
9827 Py_ssize_t length, newlength = 0;
9828 int kind, outkind;
9829 void *data, *outdata;
9830 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9831
Benjamin Petersoneea48462012-01-16 14:28:50 -05009832 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833
9834 kind = PyUnicode_KIND(self);
9835 data = PyUnicode_DATA(self);
9836 length = PyUnicode_GET_LENGTH(self);
9837 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9838 if (tmp == NULL)
9839 return PyErr_NoMemory();
9840 newlength = perform(kind, data, length, tmp, &maxchar);
9841 res = PyUnicode_New(newlength, maxchar);
9842 if (res == NULL)
9843 goto leave;
9844 tmpend = tmp + newlength;
9845 outdata = PyUnicode_DATA(res);
9846 outkind = PyUnicode_KIND(res);
9847 switch (outkind) {
9848 case PyUnicode_1BYTE_KIND:
9849 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9850 break;
9851 case PyUnicode_2BYTE_KIND:
9852 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9853 break;
9854 case PyUnicode_4BYTE_KIND:
9855 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9856 break;
9857 default:
9858 assert(0);
9859 break;
9860 }
9861 leave:
9862 PyMem_FREE(tmp);
9863 return res;
9864}
9865
Tim Peters8ce9f162004-08-27 01:49:32 +00009866PyObject *
9867PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009870 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009873 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9874 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009875 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009877 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009879 int use_memcpy;
9880 unsigned char *res_data = NULL, *sep_data = NULL;
9881 PyObject *last_obj;
9882 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Tim Peters05eba1f2004-08-27 21:32:02 +00009884 fseq = PySequence_Fast(seq, "");
9885 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009886 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009887 }
9888
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009889 /* NOTE: the following code can't call back into Python code,
9890 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009891 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009892
Tim Peters05eba1f2004-08-27 21:32:02 +00009893 seqlen = PySequence_Fast_GET_SIZE(fseq);
9894 /* If empty sequence, return u"". */
9895 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009896 Py_DECREF(fseq);
9897 Py_INCREF(unicode_empty);
9898 res = unicode_empty;
9899 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009900 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009901
Tim Peters05eba1f2004-08-27 21:32:02 +00009902 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009903 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009904 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009905 if (seqlen == 1) {
9906 if (PyUnicode_CheckExact(items[0])) {
9907 res = items[0];
9908 Py_INCREF(res);
9909 Py_DECREF(fseq);
9910 return res;
9911 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009912 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009913 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009914 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009916 /* Set up sep and seplen */
9917 if (separator == NULL) {
9918 /* fall back to a blank space separator */
9919 sep = PyUnicode_FromOrdinal(' ');
9920 if (!sep)
9921 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009922 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009923 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009924 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009925 else {
9926 if (!PyUnicode_Check(separator)) {
9927 PyErr_Format(PyExc_TypeError,
9928 "separator: expected str instance,"
9929 " %.80s found",
9930 Py_TYPE(separator)->tp_name);
9931 goto onError;
9932 }
9933 if (PyUnicode_READY(separator))
9934 goto onError;
9935 sep = separator;
9936 seplen = PyUnicode_GET_LENGTH(separator);
9937 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9938 /* inc refcount to keep this code path symmetric with the
9939 above case of a blank separator */
9940 Py_INCREF(sep);
9941 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009942 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009943 }
9944
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009945 /* There are at least two things to join, or else we have a subclass
9946 * of str in the sequence.
9947 * Do a pre-pass to figure out the total amount of space we'll
9948 * need (sz), and see whether all argument are strings.
9949 */
9950 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009951#ifdef Py_DEBUG
9952 use_memcpy = 0;
9953#else
9954 use_memcpy = 1;
9955#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 for (i = 0; i < seqlen; i++) {
9957 const Py_ssize_t old_sz = sz;
9958 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 if (!PyUnicode_Check(item)) {
9960 PyErr_Format(PyExc_TypeError,
9961 "sequence item %zd: expected str instance,"
9962 " %.80s found",
9963 i, Py_TYPE(item)->tp_name);
9964 goto onError;
9965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (PyUnicode_READY(item) == -1)
9967 goto onError;
9968 sz += PyUnicode_GET_LENGTH(item);
9969 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009970 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009971 if (i != 0)
9972 sz += seplen;
9973 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9974 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009976 goto onError;
9977 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009978 if (use_memcpy && last_obj != NULL) {
9979 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9980 use_memcpy = 0;
9981 }
9982 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 }
Tim Petersced69f82003-09-16 20:30:58 +00009984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 if (res == NULL)
9987 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009988
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009989 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009990#ifdef Py_DEBUG
9991 use_memcpy = 0;
9992#else
9993 if (use_memcpy) {
9994 res_data = PyUnicode_1BYTE_DATA(res);
9995 kind = PyUnicode_KIND(res);
9996 if (seplen != 0)
9997 sep_data = PyUnicode_1BYTE_DATA(sep);
9998 }
9999#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010001 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010004 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010005 if (use_memcpy) {
10006 Py_MEMCPY(res_data,
10007 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010008 kind * seplen);
10009 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010010 }
10011 else {
10012 copy_characters(res, res_offset, sep, 0, seplen);
10013 res_offset += seplen;
10014 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010016 itemlen = PyUnicode_GET_LENGTH(item);
10017 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010018 if (use_memcpy) {
10019 Py_MEMCPY(res_data,
10020 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010021 kind * itemlen);
10022 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010023 }
10024 else {
10025 copy_characters(res, res_offset, item, 0, itemlen);
10026 res_offset += itemlen;
10027 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010028 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010029 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010030 if (use_memcpy)
10031 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 else
10034 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010035
Tim Peters05eba1f2004-08-27 21:32:02 +000010036 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010038 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010042 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010044 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045 return NULL;
10046}
10047
/* FILL(kind, data, value, start, length)
 *
 * Store the code point `value` into `length` consecutive characters of the
 * character buffer `data`, beginning at character index `start`.  `kind`
 * must be PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND
 * and selects the width of one character in `data`.
 *
 * Every argument is evaluated exactly once, so expressions with side
 * effects are safe to pass.  A `length` of zero or less writes nothing.
 *
 * Nothing is range-checked here: the caller must make sure that `value`
 * fits in a character of the given kind and that the characters
 * [start, start + length) lie inside the buffer.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        const int kind_ = (kind); \
        void *const data_ = (data); \
        const Py_UCS4 value_ = (value); \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        Py_ssize_t i_; \
        assert(kind_ != PyUnicode_WCHAR_KIND); \
        switch (kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)data_ + start_; \
            /* a negative count would become a huge size_t in memset */ \
            if (length_ > 0) \
                memset(to_, (unsigned char)value_, (size_t)length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)data_ + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = (Py_UCS2)value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)data_ + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = value_; \
            break; \
        } \
        default: \
            assert(0); \
        } \
    } while (0)
10071
Victor Stinner3fe55312012-01-04 00:33:50 +010010072Py_ssize_t
10073PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10074 Py_UCS4 fill_char)
10075{
10076 Py_ssize_t maxlen;
10077 enum PyUnicode_Kind kind;
10078 void *data;
10079
10080 if (!PyUnicode_Check(unicode)) {
10081 PyErr_BadInternalCall();
10082 return -1;
10083 }
10084 if (PyUnicode_READY(unicode) == -1)
10085 return -1;
10086 if (unicode_check_modifiable(unicode))
10087 return -1;
10088
10089 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10090 PyErr_SetString(PyExc_ValueError,
10091 "fill character is bigger than "
10092 "the string maximum character");
10093 return -1;
10094 }
10095
10096 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10097 length = Py_MIN(maxlen, length);
10098 if (length <= 0)
10099 return 0;
10100
10101 kind = PyUnicode_KIND(unicode);
10102 data = PyUnicode_DATA(unicode);
10103 FILL(kind, data, fill_char, start, length);
10104 return length;
10105}
10106
Victor Stinner9310abb2011-10-05 00:59:23 +020010107static PyObject *
10108pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010109 Py_ssize_t left,
10110 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 PyObject *u;
10114 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010115 int kind;
10116 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117
10118 if (left < 0)
10119 left = 0;
10120 if (right < 0)
10121 right = 0;
10122
Victor Stinnerc4b49542011-12-11 22:44:26 +010010123 if (left == 0 && right == 0)
10124 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10127 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010128 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10129 return NULL;
10130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10132 if (fill > maxchar)
10133 maxchar = fill;
10134 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010135 if (!u)
10136 return NULL;
10137
10138 kind = PyUnicode_KIND(u);
10139 data = PyUnicode_DATA(u);
10140 if (left)
10141 FILL(kind, data, fill, 0, left);
10142 if (right)
10143 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010144 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010145 assert(_PyUnicode_CheckConsistency(u, 1));
10146 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147}
10148
Alexander Belopolsky40018472011-02-26 01:02:56 +000010149PyObject *
10150PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
10154 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010155 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010157 if (PyUnicode_READY(string) == -1) {
10158 Py_DECREF(string);
10159 return NULL;
10160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
Benjamin Petersonead6b532011-12-20 17:23:42 -060010162 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 if (PyUnicode_IS_ASCII(string))
10165 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010166 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010167 PyUnicode_GET_LENGTH(string), keepends);
10168 else
10169 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010170 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010171 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 break;
10173 case PyUnicode_2BYTE_KIND:
10174 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010175 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 PyUnicode_GET_LENGTH(string), keepends);
10177 break;
10178 case PyUnicode_4BYTE_KIND:
10179 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 PyUnicode_GET_LENGTH(string), keepends);
10182 break;
10183 default:
10184 assert(0);
10185 list = 0;
10186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 Py_DECREF(string);
10188 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189}
10190
Alexander Belopolsky40018472011-02-26 01:02:56 +000010191static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010192split(PyObject *self,
10193 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010194 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 int kind1, kind2, kind;
10197 void *buf1, *buf2;
10198 Py_ssize_t len1, len2;
10199 PyObject* out;
10200
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010202 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 if (PyUnicode_READY(self) == -1)
10205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(self))
10211 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 PyUnicode_GET_LENGTH(self), maxcount
10214 );
10215 else
10216 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 PyUnicode_GET_LENGTH(self), maxcount
10219 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 case PyUnicode_2BYTE_KIND:
10221 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyUnicode_GET_LENGTH(self), maxcount
10224 );
10225 case PyUnicode_4BYTE_KIND:
10226 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010227 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 PyUnicode_GET_LENGTH(self), maxcount
10229 );
10230 default:
10231 assert(0);
10232 return NULL;
10233 }
10234
10235 if (PyUnicode_READY(substring) == -1)
10236 return NULL;
10237
10238 kind1 = PyUnicode_KIND(self);
10239 kind2 = PyUnicode_KIND(substring);
10240 kind = kind1 > kind2 ? kind1 : kind2;
10241 buf1 = PyUnicode_DATA(self);
10242 buf2 = PyUnicode_DATA(substring);
10243 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (!buf1)
10246 return NULL;
10247 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (!buf2) {
10250 if (kind1 != kind) PyMem_Free(buf1);
10251 return NULL;
10252 }
10253 len1 = PyUnicode_GET_LENGTH(self);
10254 len2 = PyUnicode_GET_LENGTH(substring);
10255
Benjamin Petersonead6b532011-12-20 17:23:42 -060010256 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10259 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010260 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010261 else
10262 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 break;
10265 case PyUnicode_2BYTE_KIND:
10266 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 break;
10269 case PyUnicode_4BYTE_KIND:
10270 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010271 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 break;
10273 default:
10274 out = NULL;
10275 }
10276 if (kind1 != kind)
10277 PyMem_Free(buf1);
10278 if (kind2 != kind)
10279 PyMem_Free(buf2);
10280 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281}
10282
Alexander Belopolsky40018472011-02-26 01:02:56 +000010283static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010284rsplit(PyObject *self,
10285 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010286 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 int kind1, kind2, kind;
10289 void *buf1, *buf2;
10290 Py_ssize_t len1, len2;
10291 PyObject* out;
10292
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010293 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010294 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (PyUnicode_READY(self) == -1)
10297 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010300 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 if (PyUnicode_IS_ASCII(self))
10303 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010305 PyUnicode_GET_LENGTH(self), maxcount
10306 );
10307 else
10308 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010310 PyUnicode_GET_LENGTH(self), maxcount
10311 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 case PyUnicode_2BYTE_KIND:
10313 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010314 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 PyUnicode_GET_LENGTH(self), maxcount
10316 );
10317 case PyUnicode_4BYTE_KIND:
10318 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 PyUnicode_GET_LENGTH(self), maxcount
10321 );
10322 default:
10323 assert(0);
10324 return NULL;
10325 }
10326
10327 if (PyUnicode_READY(substring) == -1)
10328 return NULL;
10329
10330 kind1 = PyUnicode_KIND(self);
10331 kind2 = PyUnicode_KIND(substring);
10332 kind = kind1 > kind2 ? kind1 : kind2;
10333 buf1 = PyUnicode_DATA(self);
10334 buf2 = PyUnicode_DATA(substring);
10335 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (!buf1)
10338 return NULL;
10339 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (!buf2) {
10342 if (kind1 != kind) PyMem_Free(buf1);
10343 return NULL;
10344 }
10345 len1 = PyUnicode_GET_LENGTH(self);
10346 len2 = PyUnicode_GET_LENGTH(substring);
10347
Benjamin Petersonead6b532011-12-20 17:23:42 -060010348 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10351 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353 else
10354 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 break;
10357 case PyUnicode_2BYTE_KIND:
10358 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 break;
10361 case PyUnicode_4BYTE_KIND:
10362 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 break;
10365 default:
10366 out = NULL;
10367 }
10368 if (kind1 != kind)
10369 PyMem_Free(buf1);
10370 if (kind2 != kind)
10371 PyMem_Free(buf2);
10372 return out;
10373}
10374
10375static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010376anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10377 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010379 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010381 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10382 return asciilib_find(buf1, len1, buf2, len2, offset);
10383 else
10384 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 case PyUnicode_2BYTE_KIND:
10386 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10387 case PyUnicode_4BYTE_KIND:
10388 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10389 }
10390 assert(0);
10391 return -1;
10392}
10393
10394static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10396 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010398 switch (kind) {
10399 case PyUnicode_1BYTE_KIND:
10400 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10401 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10402 else
10403 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10404 case PyUnicode_2BYTE_KIND:
10405 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10406 case PyUnicode_4BYTE_KIND:
10407 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10408 }
10409 assert(0);
10410 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010411}
10412
Alexander Belopolsky40018472011-02-26 01:02:56 +000010413static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414replace(PyObject *self, PyObject *str1,
10415 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyObject *u;
10418 char *sbuf = PyUnicode_DATA(self);
10419 char *buf1 = PyUnicode_DATA(str1);
10420 char *buf2 = PyUnicode_DATA(str2);
10421 int srelease = 0, release1 = 0, release2 = 0;
10422 int skind = PyUnicode_KIND(self);
10423 int kind1 = PyUnicode_KIND(str1);
10424 int kind2 = PyUnicode_KIND(str2);
10425 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10426 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10427 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 int mayshrink;
10429 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430
10431 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010432 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010434 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
Victor Stinner59de0ee2011-10-07 10:01:28 +020010436 if (str1 == str2)
10437 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 if (skind < kind1)
10439 /* substring too wide to be present */
10440 goto nothing;
10441
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10443 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10444 /* Replacing str1 with str2 may cause a maxchar reduction in the
10445 result string. */
10446 mayshrink = (maxchar_str2 < maxchar);
10447 maxchar = Py_MAX(maxchar, maxchar_str2);
10448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010452 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010455 Py_UCS4 u1, u2;
10456 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010457 Py_ssize_t index, pos;
10458 char *src;
10459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010461 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10462 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010468 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010470
10471 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10472 index = 0;
10473 src = sbuf;
10474 while (--maxcount)
10475 {
10476 pos++;
10477 src += pos * PyUnicode_KIND(self);
10478 slen -= pos;
10479 index += pos;
10480 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10481 if (pos < 0)
10482 break;
10483 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10484 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010485 }
10486 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 int rkind = skind;
10488 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010489 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (kind1 < rkind) {
10492 /* widen substring */
10493 buf1 = _PyUnicode_AsKind(str1, rkind);
10494 if (!buf1) goto error;
10495 release1 = 1;
10496 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010497 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 if (i < 0)
10499 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (rkind > kind2) {
10501 /* widen replacement */
10502 buf2 = _PyUnicode_AsKind(str2, rkind);
10503 if (!buf2) goto error;
10504 release2 = 1;
10505 }
10506 else if (rkind < kind2) {
10507 /* widen self and buf1 */
10508 rkind = kind2;
10509 if (release1) PyMem_Free(buf1);
10510 sbuf = _PyUnicode_AsKind(self, rkind);
10511 if (!sbuf) goto error;
10512 srelease = 1;
10513 buf1 = _PyUnicode_AsKind(str1, rkind);
10514 if (!buf1) goto error;
10515 release1 = 1;
10516 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 u = PyUnicode_New(slen, maxchar);
10518 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 assert(PyUnicode_KIND(u) == rkind);
10521 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010522
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010524 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010527 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010529
10530 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010531 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010532 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010533 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010534 if (i == -1)
10535 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010536 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010538 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010542 }
10543 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 Py_ssize_t n, i, j, ires;
10545 Py_ssize_t product, new_size;
10546 int rkind = skind;
10547 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010556 if (n == 0)
10557 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010559 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 buf2 = _PyUnicode_AsKind(str2, rkind);
10561 if (!buf2) goto error;
10562 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 rkind = kind2;
10567 sbuf = _PyUnicode_AsKind(self, rkind);
10568 if (!sbuf) goto error;
10569 srelease = 1;
10570 if (release1) PyMem_Free(buf1);
10571 buf1 = _PyUnicode_AsKind(str1, rkind);
10572 if (!buf1) goto error;
10573 release1 = 1;
10574 }
10575 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10576 PyUnicode_GET_LENGTH(str1))); */
10577 product = n * (len2-len1);
10578 if ((product / (len2-len1)) != n) {
10579 PyErr_SetString(PyExc_OverflowError,
10580 "replace string is too long");
10581 goto error;
10582 }
10583 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 if (new_size == 0) {
10585 Py_INCREF(unicode_empty);
10586 u = unicode_empty;
10587 goto done;
10588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10590 PyErr_SetString(PyExc_OverflowError,
10591 "replace string is too long");
10592 goto error;
10593 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 u = PyUnicode_New(new_size, maxchar);
10595 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 assert(PyUnicode_KIND(u) == rkind);
10598 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 ires = i = 0;
10600 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 while (n-- > 0) {
10602 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010604 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010605 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010606 if (j == -1)
10607 break;
10608 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010610 memcpy(res + rkind * ires,
10611 sbuf + rkind * i,
10612 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 }
10615 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 memcpy(res + rkind * ires,
10627 sbuf + rkind * i,
10628 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010629 }
10630 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 /* interleave */
10632 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 if (--n <= 0)
10638 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 memcpy(res + rkind * ires,
10640 sbuf + rkind * i,
10641 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 ires++;
10643 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 memcpy(res + rkind * ires,
10646 sbuf + rkind * i,
10647 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 }
10650
10651 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010652 unicode_adjust_maxchar(&u);
10653 if (u == NULL)
10654 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010656
10657 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (srelease)
10659 PyMem_FREE(sbuf);
10660 if (release1)
10661 PyMem_FREE(buf1);
10662 if (release2)
10663 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010664 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 if (srelease)
10670 PyMem_FREE(sbuf);
10671 if (release1)
10672 PyMem_FREE(buf1);
10673 if (release2)
10674 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010675 return unicode_result_unchanged(self);
10676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 error:
10678 if (srelease && sbuf)
10679 PyMem_FREE(sbuf);
10680 if (release1 && buf1)
10681 PyMem_FREE(buf1);
10682 if (release2 && buf2)
10683 PyMem_FREE(buf2);
10684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685}
10686
10687/* --- Unicode Object Methods --------------------------------------------- */
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
10692Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010693characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010696unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010698 if (PyUnicode_READY(self) == -1)
10699 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010700 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701}
10702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010703PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705\n\
10706Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010707have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
10709static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010710unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010712 if (PyUnicode_READY(self) == -1)
10713 return NULL;
10714 if (PyUnicode_GET_LENGTH(self) == 0)
10715 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010716 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717}
10718
Benjamin Petersond5890c82012-01-14 13:23:30 -050010719PyDoc_STRVAR(casefold__doc__,
10720 "S.casefold() -> str\n\
10721\n\
10722Return a version of S suitable for caseless comparisons.");
10723
10724static PyObject *
10725unicode_casefold(PyObject *self)
10726{
10727 if (PyUnicode_READY(self) == -1)
10728 return NULL;
10729 if (PyUnicode_IS_ASCII(self))
10730 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010731 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010732}
10733
10734
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010735/* Argument converter. Coerces to a single unicode character */
10736
10737static int
10738convert_uc(PyObject *obj, void *addr)
10739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010741 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010742
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 uniobj = PyUnicode_FromObject(obj);
10744 if (uniobj == NULL) {
10745 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 return 0;
10748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 Py_DECREF(uniobj);
10753 return 0;
10754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010756 Py_DECREF(uniobj);
10757 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010758}
10759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010760PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010763Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010764done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
10766static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010767unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010769 Py_ssize_t marg, left;
10770 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 Py_UCS4 fillchar = ' ';
10772
Victor Stinnere9a29352011-10-01 02:14:59 +020010773 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775
Benjamin Petersonbac79492012-01-14 13:34:47 -050010776 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 return NULL;
10778
Victor Stinnerc4b49542011-12-11 22:44:26 +010010779 if (PyUnicode_GET_LENGTH(self) >= width)
10780 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Victor Stinnerc4b49542011-12-11 22:44:26 +010010782 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783 left = marg / 2 + (marg & width & 1);
10784
Victor Stinner9310abb2011-10-05 00:59:23 +020010785 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786}
10787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788/* This function assumes that str1 and str2 are readied by the caller. */
10789
Marc-André Lemburge5034372000-08-08 08:04:29 +000010790static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010791unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 int kind1, kind2;
10794 void *data1, *data2;
10795 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 kind1 = PyUnicode_KIND(str1);
10798 kind2 = PyUnicode_KIND(str2);
10799 data1 = PyUnicode_DATA(str1);
10800 data2 = PyUnicode_DATA(str2);
10801 len1 = PyUnicode_GET_LENGTH(str1);
10802 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 for (i = 0; i < len1 && i < len2; ++i) {
10805 Py_UCS4 c1, c2;
10806 c1 = PyUnicode_READ(kind1, data1, i);
10807 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010808
10809 if (c1 != c2)
10810 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010811 }
10812
10813 return (len1 < len2) ? -1 : (len1 != len2);
10814}
10815
Alexander Belopolsky40018472011-02-26 01:02:56 +000010816int
10817PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10820 if (PyUnicode_READY(left) == -1 ||
10821 PyUnicode_READY(right) == -1)
10822 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010825 PyErr_Format(PyExc_TypeError,
10826 "Can't compare %.100s and %.100s",
10827 left->ob_type->tp_name,
10828 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829 return -1;
10830}
10831
Martin v. Löwis5b222132007-06-10 09:51:05 +000010832int
10833PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 Py_ssize_t i;
10836 int kind;
10837 void *data;
10838 Py_UCS4 chr;
10839
Victor Stinner910337b2011-10-03 03:20:16 +020010840 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (PyUnicode_READY(uni) == -1)
10842 return -1;
10843 kind = PyUnicode_KIND(uni);
10844 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010845 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10847 if (chr != str[i])
10848 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010849 /* This check keeps Python strings that end in '\0' from comparing equal
10850 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010852 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010853 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010854 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010855 return 0;
10856}
10857
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010858
/* Map a C truth value to Py_True / Py_False (no reference is taken). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010861
Alexander Belopolsky40018472011-02-26 01:02:56 +000010862PyObject *
10863PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010864{
10865 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010866
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010867 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10868 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 if (PyUnicode_READY(left) == -1 ||
10870 PyUnicode_READY(right) == -1)
10871 return NULL;
10872 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10873 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010874 if (op == Py_EQ) {
10875 Py_INCREF(Py_False);
10876 return Py_False;
10877 }
10878 if (op == Py_NE) {
10879 Py_INCREF(Py_True);
10880 return Py_True;
10881 }
10882 }
10883 if (left == right)
10884 result = 0;
10885 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010886 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010887
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010888 /* Convert the return value to a Boolean */
10889 switch (op) {
10890 case Py_EQ:
10891 v = TEST_COND(result == 0);
10892 break;
10893 case Py_NE:
10894 v = TEST_COND(result != 0);
10895 break;
10896 case Py_LE:
10897 v = TEST_COND(result <= 0);
10898 break;
10899 case Py_GE:
10900 v = TEST_COND(result >= 0);
10901 break;
10902 case Py_LT:
10903 v = TEST_COND(result == -1);
10904 break;
10905 case Py_GT:
10906 v = TEST_COND(result == 1);
10907 break;
10908 default:
10909 PyErr_BadArgument();
10910 return NULL;
10911 }
10912 Py_INCREF(v);
10913 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010914 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010915
Brian Curtindfc80e32011-08-10 20:28:54 -050010916 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010917}
10918
Alexander Belopolsky40018472011-02-26 01:02:56 +000010919int
10920PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010921{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 int kind1, kind2, kind;
10924 void *buf1, *buf2;
10925 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010926 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010927
10928 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929 sub = PyUnicode_FromObject(element);
10930 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010931 PyErr_Format(PyExc_TypeError,
10932 "'in <string>' requires string as left operand, not %s",
10933 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010935 }
10936
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010938 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010939 Py_DECREF(sub);
10940 return -1;
10941 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010942 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10943 Py_DECREF(sub);
10944 Py_DECREF(str);
10945 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 kind1 = PyUnicode_KIND(str);
10948 kind2 = PyUnicode_KIND(sub);
10949 kind = kind1 > kind2 ? kind1 : kind2;
10950 buf1 = PyUnicode_DATA(str);
10951 buf2 = PyUnicode_DATA(sub);
10952 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010953 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 if (!buf1) {
10955 Py_DECREF(sub);
10956 return -1;
10957 }
10958 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010959 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (!buf2) {
10961 Py_DECREF(sub);
10962 if (kind1 != kind) PyMem_Free(buf1);
10963 return -1;
10964 }
10965 len1 = PyUnicode_GET_LENGTH(str);
10966 len2 = PyUnicode_GET_LENGTH(sub);
10967
Benjamin Petersonead6b532011-12-20 17:23:42 -060010968 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 case PyUnicode_1BYTE_KIND:
10970 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10971 break;
10972 case PyUnicode_2BYTE_KIND:
10973 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10974 break;
10975 case PyUnicode_4BYTE_KIND:
10976 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10977 break;
10978 default:
10979 result = -1;
10980 assert(0);
10981 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010982
10983 Py_DECREF(str);
10984 Py_DECREF(sub);
10985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 if (kind1 != kind)
10987 PyMem_Free(buf1);
10988 if (kind2 != kind)
10989 PyMem_Free(buf2);
10990
Guido van Rossum403d68b2000-03-13 15:55:09 +000010991 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010992}
10993
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994/* Concat to string or Unicode object giving a new Unicode object. */
10995
Alexander Belopolsky40018472011-02-26 01:02:56 +000010996PyObject *
10997PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011000 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011001 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
11003 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
11011 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011012 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011016 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 }
11020
Victor Stinner488fa492011-12-12 00:01:39 +010011021 u_len = PyUnicode_GET_LENGTH(u);
11022 v_len = PyUnicode_GET_LENGTH(v);
11023 if (u_len > PY_SSIZE_T_MAX - v_len) {
11024 PyErr_SetString(PyExc_OverflowError,
11025 "strings are too large to concat");
11026 goto onError;
11027 }
11028 new_len = u_len + v_len;
11029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011031 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11032 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011035 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011038 copy_characters(w, 0, u, 0, u_len);
11039 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 Py_DECREF(u);
11041 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011042 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046 Py_XDECREF(u);
11047 Py_XDECREF(v);
11048 return NULL;
11049}
11050
Walter Dörwald1ab83302007-05-18 17:15:44 +000011051void
Victor Stinner23e56682011-10-03 03:54:37 +020011052PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011053{
Victor Stinner23e56682011-10-03 03:54:37 +020011054 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011055 Py_UCS4 maxchar, maxchar2;
11056 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011057
11058 if (p_left == NULL) {
11059 if (!PyErr_Occurred())
11060 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011061 return;
11062 }
Victor Stinner23e56682011-10-03 03:54:37 +020011063 left = *p_left;
11064 if (right == NULL || !PyUnicode_Check(left)) {
11065 if (!PyErr_Occurred())
11066 PyErr_BadInternalCall();
11067 goto error;
11068 }
11069
Benjamin Petersonbac79492012-01-14 13:34:47 -050011070 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011071 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011072 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011073 goto error;
11074
Victor Stinner488fa492011-12-12 00:01:39 +010011075 /* Shortcuts */
11076 if (left == unicode_empty) {
11077 Py_DECREF(left);
11078 Py_INCREF(right);
11079 *p_left = right;
11080 return;
11081 }
11082 if (right == unicode_empty)
11083 return;
11084
11085 left_len = PyUnicode_GET_LENGTH(left);
11086 right_len = PyUnicode_GET_LENGTH(right);
11087 if (left_len > PY_SSIZE_T_MAX - right_len) {
11088 PyErr_SetString(PyExc_OverflowError,
11089 "strings are too large to concat");
11090 goto error;
11091 }
11092 new_len = left_len + right_len;
11093
11094 if (unicode_modifiable(left)
11095 && PyUnicode_CheckExact(right)
11096 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011097 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11098 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011099 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011100 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011101 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11102 {
11103 /* append inplace */
11104 if (unicode_resize(p_left, new_len) != 0) {
11105 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11106 * deallocated so it cannot be put back into
11107 * 'variable'. The MemoryError is raised when there
11108 * is no value in 'variable', which might (very
11109 * remotely) be a cause of incompatibilities.
11110 */
11111 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011112 }
Victor Stinner488fa492011-12-12 00:01:39 +010011113 /* copy 'right' into the newly allocated area of 'left' */
11114 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011115 }
Victor Stinner488fa492011-12-12 00:01:39 +010011116 else {
11117 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11118 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11119 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011120
Victor Stinner488fa492011-12-12 00:01:39 +010011121 /* Concat the two Unicode strings */
11122 res = PyUnicode_New(new_len, maxchar);
11123 if (res == NULL)
11124 goto error;
11125 copy_characters(res, 0, left, 0, left_len);
11126 copy_characters(res, left_len, right, 0, right_len);
11127 Py_DECREF(left);
11128 *p_left = res;
11129 }
11130 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011131 return;
11132
11133error:
Victor Stinner488fa492011-12-12 00:01:39 +010011134 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011135}
11136
11137void
11138PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11139{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011140 PyUnicode_Append(pleft, right);
11141 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011142}
11143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011144PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011148string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
11151static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011152unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011154 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011155 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011156 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 int kind1, kind2, kind;
11159 void *buf1, *buf2;
11160 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
Jesus Ceaac451502011-04-20 17:09:23 +020011162 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11163 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 kind1 = PyUnicode_KIND(self);
11167 kind2 = PyUnicode_KIND(substring);
11168 kind = kind1 > kind2 ? kind1 : kind2;
11169 buf1 = PyUnicode_DATA(self);
11170 buf2 = PyUnicode_DATA(substring);
11171 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011172 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (!buf1) {
11174 Py_DECREF(substring);
11175 return NULL;
11176 }
11177 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011178 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (!buf2) {
11180 Py_DECREF(substring);
11181 if (kind1 != kind) PyMem_Free(buf1);
11182 return NULL;
11183 }
11184 len1 = PyUnicode_GET_LENGTH(self);
11185 len2 = PyUnicode_GET_LENGTH(substring);
11186
11187 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011188 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 case PyUnicode_1BYTE_KIND:
11190 iresult = ucs1lib_count(
11191 ((Py_UCS1*)buf1) + start, end - start,
11192 buf2, len2, PY_SSIZE_T_MAX
11193 );
11194 break;
11195 case PyUnicode_2BYTE_KIND:
11196 iresult = ucs2lib_count(
11197 ((Py_UCS2*)buf1) + start, end - start,
11198 buf2, len2, PY_SSIZE_T_MAX
11199 );
11200 break;
11201 case PyUnicode_4BYTE_KIND:
11202 iresult = ucs4lib_count(
11203 ((Py_UCS4*)buf1) + start, end - start,
11204 buf2, len2, PY_SSIZE_T_MAX
11205 );
11206 break;
11207 default:
11208 assert(0); iresult = 0;
11209 }
11210
11211 result = PyLong_FromSsize_t(iresult);
11212
11213 if (kind1 != kind)
11214 PyMem_Free(buf1);
11215 if (kind2 != kind)
11216 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217
11218 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011219
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220 return result;
11221}
11222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011224 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011226Encode S using the codec registered for encoding. Default encoding\n\
11227is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011228handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011229a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11230'xmlcharrefreplace' as well as any other name registered with\n\
11231codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011234unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011236 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 char *encoding = NULL;
11238 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011239
Benjamin Peterson308d6372009-09-18 21:42:35 +000011240 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11241 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011243 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011244}
11245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011246PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011247 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248\n\
11249Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011250If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
11252static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011253unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011255 Py_ssize_t i, j, line_pos, src_len, incr;
11256 Py_UCS4 ch;
11257 PyObject *u;
11258 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011260 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011261 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
Antoine Pitrou22425222011-10-04 19:10:51 +020011266 if (PyUnicode_READY(self) == -1)
11267 return NULL;
11268
Thomas Wouters7e474022000-07-16 12:04:32 +000011269 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 src_len = PyUnicode_GET_LENGTH(self);
11271 i = j = line_pos = 0;
11272 kind = PyUnicode_KIND(self);
11273 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011274 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 for (; i < src_len; i++) {
11276 ch = PyUnicode_READ(kind, src_data, i);
11277 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011278 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 goto overflow;
11283 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011285 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 goto overflow;
11290 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011292 if (ch == '\n' || ch == '\r')
11293 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011296 if (!found)
11297 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011298
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 if (!u)
11302 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
Antoine Pitroue71d5742011-10-04 15:55:09 +020011305 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Antoine Pitroue71d5742011-10-04 15:55:09 +020011307 for (; i < src_len; i++) {
11308 ch = PyUnicode_READ(kind, src_data, i);
11309 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 incr = tabsize - (line_pos % tabsize);
11312 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011313 FILL(kind, dest_data, ' ', j, incr);
11314 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011316 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011318 line_pos++;
11319 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011320 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011321 if (ch == '\n' || ch == '\r')
11322 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011324 }
11325 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011326 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011327
Antoine Pitroue71d5742011-10-04 15:55:09 +020011328 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011329 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11330 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331}
11332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011333PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335\n\
11336Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011337such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338arguments start and end are interpreted as in slice notation.\n\
11339\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011340Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
11342static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011345 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011346 Py_ssize_t start;
11347 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011348 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
Jesus Ceaac451502011-04-20 17:09:23 +020011350 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11351 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 if (PyUnicode_READY(self) == -1)
11355 return NULL;
11356 if (PyUnicode_READY(substring) == -1)
11357 return NULL;
11358
Victor Stinner7931d9a2011-11-04 00:22:48 +010011359 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
11361 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (result == -2)
11364 return NULL;
11365
Christian Heimes217cfd12007-12-02 14:31:20 +000011366 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367}
11368
11369static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011370unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011372 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11373 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376}
11377
Guido van Rossumc2504932007-09-18 19:42:40 +000011378/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011379 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011380static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011381unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382{
Guido van Rossumc2504932007-09-18 19:42:40 +000011383 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011384 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011385
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011386#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011387 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011388#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 if (_PyUnicode_HASH(self) != -1)
11390 return _PyUnicode_HASH(self);
11391 if (PyUnicode_READY(self) == -1)
11392 return -1;
11393 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011394 /*
11395 We make the hash of the empty string be 0, rather than using
11396 (prefix ^ suffix), since this slightly obfuscates the hash secret
11397 */
11398 if (len == 0) {
11399 _PyUnicode_HASH(self) = 0;
11400 return 0;
11401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402
11403 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011404#define HASH(P) \
11405 x ^= (Py_uhash_t) *P << 7; \
11406 while (--len >= 0) \
11407 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408
Georg Brandl2fb477c2012-02-21 00:33:36 +010011409 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 switch (PyUnicode_KIND(self)) {
11411 case PyUnicode_1BYTE_KIND: {
11412 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11413 HASH(c);
11414 break;
11415 }
11416 case PyUnicode_2BYTE_KIND: {
11417 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11418 HASH(s);
11419 break;
11420 }
11421 default: {
11422 Py_UCS4 *l;
11423 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11424 "Impossible switch case in unicode_hash");
11425 l = PyUnicode_4BYTE_DATA(self);
11426 HASH(l);
11427 break;
11428 }
11429 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011430 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11431 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432
Guido van Rossumc2504932007-09-18 19:42:40 +000011433 if (x == -1)
11434 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011436 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011440PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
11445static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011448 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011449 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011450 Py_ssize_t start;
11451 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452
Jesus Ceaac451502011-04-20 17:09:23 +020011453 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11454 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (PyUnicode_READY(self) == -1)
11458 return NULL;
11459 if (PyUnicode_READY(substring) == -1)
11460 return NULL;
11461
Victor Stinner7931d9a2011-11-04 00:22:48 +010011462 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 if (result == -2)
11467 return NULL;
11468
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 if (result < 0) {
11470 PyErr_SetString(PyExc_ValueError, "substring not found");
11471 return NULL;
11472 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011473
Christian Heimes217cfd12007-12-02 14:31:20 +000011474 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475}
11476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011477PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011480Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011481at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
11483static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011484unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 Py_ssize_t i, length;
11487 int kind;
11488 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 int cased;
11490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 if (PyUnicode_READY(self) == -1)
11492 return NULL;
11493 length = PyUnicode_GET_LENGTH(self);
11494 kind = PyUnicode_KIND(self);
11495 data = PyUnicode_DATA(self);
11496
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (length == 1)
11499 return PyBool_FromLong(
11500 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011502 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011505
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 for (i = 0; i < length; i++) {
11508 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011509
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11511 return PyBool_FromLong(0);
11512 else if (!cased && Py_UNICODE_ISLOWER(ch))
11513 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011515 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516}
11517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011521Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011522at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
11524static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011525unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 Py_ssize_t i, length;
11528 int kind;
11529 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 int cased;
11531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 if (PyUnicode_READY(self) == -1)
11533 return NULL;
11534 length = PyUnicode_GET_LENGTH(self);
11535 kind = PyUnicode_KIND(self);
11536 data = PyUnicode_DATA(self);
11537
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 if (length == 1)
11540 return PyBool_FromLong(
11541 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011543 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011546
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 for (i = 0; i < length; i++) {
11549 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011550
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11552 return PyBool_FromLong(0);
11553 else if (!cased && Py_UNICODE_ISUPPER(ch))
11554 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011556 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557}
11558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011559PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011562Return True if S is a titlecased string and there is at least one\n\
11563character in S, i.e. upper- and titlecase characters may only\n\
11564follow uncased characters and lowercase characters only cased ones.\n\
11565Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566
11567static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011568unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 Py_ssize_t i, length;
11571 int kind;
11572 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 int cased, previous_is_cased;
11574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (PyUnicode_READY(self) == -1)
11576 return NULL;
11577 length = PyUnicode_GET_LENGTH(self);
11578 kind = PyUnicode_KIND(self);
11579 data = PyUnicode_DATA(self);
11580
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 if (length == 1) {
11583 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11584 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11585 (Py_UNICODE_ISUPPER(ch) != 0));
11586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011588 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011591
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 cased = 0;
11593 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 for (i = 0; i < length; i++) {
11595 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011596
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11598 if (previous_is_cased)
11599 return PyBool_FromLong(0);
11600 previous_is_cased = 1;
11601 cased = 1;
11602 }
11603 else if (Py_UNICODE_ISLOWER(ch)) {
11604 if (!previous_is_cased)
11605 return PyBool_FromLong(0);
11606 previous_is_cased = 1;
11607 cased = 1;
11608 }
11609 else
11610 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011612 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613}
11614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011618Return True if all characters in S are whitespace\n\
11619and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620
11621static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011622unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 Py_ssize_t i, length;
11625 int kind;
11626 void *data;
11627
11628 if (PyUnicode_READY(self) == -1)
11629 return NULL;
11630 length = PyUnicode_GET_LENGTH(self);
11631 kind = PyUnicode_KIND(self);
11632 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (length == 1)
11636 return PyBool_FromLong(
11637 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011639 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 for (i = 0; i < length; i++) {
11644 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011645 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011648 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649}
11650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011654Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656
11657static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011658unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 Py_ssize_t i, length;
11661 int kind;
11662 void *data;
11663
11664 if (PyUnicode_READY(self) == -1)
11665 return NULL;
11666 length = PyUnicode_GET_LENGTH(self);
11667 kind = PyUnicode_KIND(self);
11668 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011670 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 if (length == 1)
11672 return PyBool_FromLong(
11673 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011674
11675 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 for (i = 0; i < length; i++) {
11680 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011682 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011683 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011684}
11685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011688\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011689Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011690and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691
11692static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011693unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 int kind;
11696 void *data;
11697 Py_ssize_t len, i;
11698
11699 if (PyUnicode_READY(self) == -1)
11700 return NULL;
11701
11702 kind = PyUnicode_KIND(self);
11703 data = PyUnicode_DATA(self);
11704 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011705
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011706 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 if (len == 1) {
11708 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11709 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11710 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011711
11712 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 for (i = 0; i < len; i++) {
11717 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011718 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011721 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011722}
11723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011724PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011727Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011728False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011731unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 Py_ssize_t i, length;
11734 int kind;
11735 void *data;
11736
11737 if (PyUnicode_READY(self) == -1)
11738 return NULL;
11739 length = PyUnicode_GET_LENGTH(self);
11740 kind = PyUnicode_KIND(self);
11741 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (length == 1)
11745 return PyBool_FromLong(
11746 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011748 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 for (i = 0; i < length; i++) {
11753 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011756 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757}
11758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011759PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011762Return True if all characters in S are digits\n\
11763and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011766unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 Py_ssize_t i, length;
11769 int kind;
11770 void *data;
11771
11772 if (PyUnicode_READY(self) == -1)
11773 return NULL;
11774 length = PyUnicode_GET_LENGTH(self);
11775 kind = PyUnicode_KIND(self);
11776 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 if (length == 1) {
11780 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11781 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 for (i = 0; i < length; i++) {
11789 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011792 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793}
11794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011795PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011798Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011799False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800
11801static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011802unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 Py_ssize_t i, length;
11805 int kind;
11806 void *data;
11807
11808 if (PyUnicode_READY(self) == -1)
11809 return NULL;
11810 length = PyUnicode_GET_LENGTH(self);
11811 kind = PyUnicode_KIND(self);
11812 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (length == 1)
11816 return PyBool_FromLong(
11817 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011819 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 for (i = 0; i < length; i++) {
11824 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011827 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828}
11829
Martin v. Löwis47383402007-08-15 07:32:56 +000011830int
11831PyUnicode_IsIdentifier(PyObject *self)
11832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 int kind;
11834 void *data;
11835 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011836 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 if (PyUnicode_READY(self) == -1) {
11839 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 }
11842
11843 /* Special case for empty strings */
11844 if (PyUnicode_GET_LENGTH(self) == 0)
11845 return 0;
11846 kind = PyUnicode_KIND(self);
11847 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011848
11849 /* PEP 3131 says that the first character must be in
11850 XID_Start and subsequent characters in XID_Continue,
11851 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011852 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011853 letters, digits, underscore). However, given the current
11854 definition of XID_Start and XID_Continue, it is sufficient
11855 to check just for these, except that _ must be allowed
11856 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011858 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011859 return 0;
11860
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011861 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011864 return 1;
11865}
11866
11867PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011869\n\
11870Return True if S is a valid identifier according\n\
11871to the language definition.");
11872
11873static PyObject*
11874unicode_isidentifier(PyObject *self)
11875{
11876 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11877}
11878
Georg Brandl559e5d72008-06-11 18:37:52 +000011879PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011881\n\
11882Return True if all characters in S are considered\n\
11883printable in repr() or S is empty, False otherwise.");
11884
11885static PyObject*
11886unicode_isprintable(PyObject *self)
11887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 Py_ssize_t i, length;
11889 int kind;
11890 void *data;
11891
11892 if (PyUnicode_READY(self) == -1)
11893 return NULL;
11894 length = PyUnicode_GET_LENGTH(self);
11895 kind = PyUnicode_KIND(self);
11896 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011897
11898 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (length == 1)
11900 return PyBool_FromLong(
11901 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 for (i = 0; i < length; i++) {
11904 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011905 Py_RETURN_FALSE;
11906 }
11907 }
11908 Py_RETURN_TRUE;
11909}
11910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011911PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011912 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913\n\
11914Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011915iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
11917static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011918unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011920 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
Martin v. Löwis18e16552006-02-15 17:27:45 +000011923static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011924unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (PyUnicode_READY(self) == -1)
11927 return -1;
11928 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929}
11930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011931PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011934Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011935done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936
11937static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011938unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011940 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 Py_UCS4 fillchar = ' ';
11942
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011943 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 return NULL;
11945
Benjamin Petersonbac79492012-01-14 13:34:47 -050011946 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
Victor Stinnerc4b49542011-12-11 22:44:26 +010011949 if (PyUnicode_GET_LENGTH(self) >= width)
11950 return unicode_result_unchanged(self);
11951
11952 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953}
11954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011955PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011958Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959
11960static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011961unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011963 if (PyUnicode_READY(self) == -1)
11964 return NULL;
11965 if (PyUnicode_IS_ASCII(self))
11966 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011967 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968}
11969
/* Strip modes.  Each value doubles as an index into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above.  Both the
   strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skip the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11978
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011979/* externally visible for str.strip(unicode) */
11980PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011981_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 void *data;
11984 int kind;
11985 Py_ssize_t i, j, len;
11986 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11989 return NULL;
11990
11991 kind = PyUnicode_KIND(self);
11992 data = PyUnicode_DATA(self);
11993 len = PyUnicode_GET_LENGTH(self);
11994 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11995 PyUnicode_DATA(sepobj),
11996 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011997
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 i = 0;
11999 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 while (i < len &&
12001 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 i++;
12003 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012004 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012005
Benjamin Peterson14339b62009-01-31 16:36:08 +000012006 j = len;
12007 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 do {
12009 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 } while (j >= i &&
12011 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012013 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012014
Victor Stinner7931d9a2011-11-04 00:22:48 +010012015 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016}
12017
12018PyObject*
12019PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12020{
12021 unsigned char *data;
12022 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012023 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024
Victor Stinnerde636f32011-10-01 03:55:54 +020012025 if (PyUnicode_READY(self) == -1)
12026 return NULL;
12027
12028 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12029
Victor Stinner12bab6d2011-10-01 01:53:49 +020012030 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012031 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032
Victor Stinner12bab6d2011-10-01 01:53:49 +020012033 length = end - start;
12034 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012035 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036
Victor Stinnerde636f32011-10-01 03:55:54 +020012037 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012038 PyErr_SetString(PyExc_IndexError, "string index out of range");
12039 return NULL;
12040 }
12041
Victor Stinnerb9275c12011-10-05 14:01:42 +020012042 if (PyUnicode_IS_ASCII(self)) {
12043 kind = PyUnicode_KIND(self);
12044 data = PyUnicode_1BYTE_DATA(self);
12045 return unicode_fromascii(data + start, length);
12046 }
12047 else {
12048 kind = PyUnicode_KIND(self);
12049 data = PyUnicode_1BYTE_DATA(self);
12050 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012051 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012052 length);
12053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
12056static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012057do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 int kind;
12060 void *data;
12061 Py_ssize_t len, i, j;
12062
12063 if (PyUnicode_READY(self) == -1)
12064 return NULL;
12065
12066 kind = PyUnicode_KIND(self);
12067 data = PyUnicode_DATA(self);
12068 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012069
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 i = 0;
12071 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012073 i++;
12074 }
12075 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012076
Benjamin Peterson14339b62009-01-31 16:36:08 +000012077 j = len;
12078 if (striptype != LEFTSTRIP) {
12079 do {
12080 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012082 j++;
12083 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012084
Victor Stinner7931d9a2011-11-04 00:22:48 +010012085 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086}
12087
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012088
12089static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012090do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012091{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012092 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012093
Benjamin Peterson14339b62009-01-31 16:36:08 +000012094 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12095 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012096
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 if (sep != NULL && sep != Py_None) {
12098 if (PyUnicode_Check(sep))
12099 return _PyUnicode_XStrip(self, striptype, sep);
12100 else {
12101 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 "%s arg must be None or str",
12103 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 return NULL;
12105 }
12106 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012109}
12110
12111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012112PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114\n\
12115Return a copy of the string S with leading and trailing\n\
12116whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012117If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012118
12119static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012120unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 if (PyTuple_GET_SIZE(args) == 0)
12123 return do_strip(self, BOTHSTRIP); /* Common case */
12124 else
12125 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126}
12127
12128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012129PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131\n\
12132Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012133If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134
12135static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012136unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 if (PyTuple_GET_SIZE(args) == 0)
12139 return do_strip(self, LEFTSTRIP); /* Common case */
12140 else
12141 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012142}
12143
12144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147\n\
12148Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012149If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150
12151static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012152unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012153{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012154 if (PyTuple_GET_SIZE(args) == 0)
12155 return do_strip(self, RIGHTSTRIP); /* Common case */
12156 else
12157 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158}
12159
12160
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012162unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012164 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166
Georg Brandl222de0f2009-04-12 12:01:50 +000012167 if (len < 1) {
12168 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012169 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Victor Stinnerc4b49542011-12-11 22:44:26 +010012172 /* no repeat, return original string */
12173 if (len == 1)
12174 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012175
Benjamin Petersonbac79492012-01-14 13:34:47 -050012176 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 return NULL;
12178
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012179 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012180 PyErr_SetString(PyExc_OverflowError,
12181 "repeated string is too long");
12182 return NULL;
12183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012185
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012186 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 if (!u)
12188 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012189 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (PyUnicode_GET_LENGTH(str) == 1) {
12192 const int kind = PyUnicode_KIND(str);
12193 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012194 if (kind == PyUnicode_1BYTE_KIND) {
12195 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012196 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012197 }
12198 else if (kind == PyUnicode_2BYTE_KIND) {
12199 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012200 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012201 ucs2[n] = fill_char;
12202 } else {
12203 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12204 assert(kind == PyUnicode_4BYTE_KIND);
12205 for (n = 0; n < len; ++n)
12206 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 }
12209 else {
12210 /* number of characters copied this far */
12211 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012212 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 char *to = (char *) PyUnicode_DATA(u);
12214 Py_MEMCPY(to, PyUnicode_DATA(str),
12215 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 n = (done <= nchars-done) ? done : nchars-done;
12218 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012219 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221 }
12222
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012223 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012224 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225}
12226
Alexander Belopolsky40018472011-02-26 01:02:56 +000012227PyObject *
12228PyUnicode_Replace(PyObject *obj,
12229 PyObject *subobj,
12230 PyObject *replobj,
12231 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232{
12233 PyObject *self;
12234 PyObject *str1;
12235 PyObject *str2;
12236 PyObject *result;
12237
12238 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012239 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012242 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 Py_DECREF(self);
12244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245 }
12246 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012247 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 Py_DECREF(self);
12249 Py_DECREF(str1);
12250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012252 if (PyUnicode_READY(self) == -1 ||
12253 PyUnicode_READY(str1) == -1 ||
12254 PyUnicode_READY(str2) == -1)
12255 result = NULL;
12256 else
12257 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 Py_DECREF(self);
12259 Py_DECREF(str1);
12260 Py_DECREF(str2);
12261 return result;
12262}
12263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012264PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012265 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266\n\
12267Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012268old replaced by new. If the optional argument count is\n\
12269given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270
12271static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 PyObject *str1;
12275 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012276 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 PyObject *result;
12278
Martin v. Löwis18e16552006-02-15 17:27:45 +000012279 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012281 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012284 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 return NULL;
12286 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012287 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 Py_DECREF(str1);
12289 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012290 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012291 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12292 result = NULL;
12293 else
12294 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295
12296 Py_DECREF(str1);
12297 Py_DECREF(str2);
12298 return result;
12299}
12300
Alexander Belopolsky40018472011-02-26 01:02:56 +000012301static PyObject *
12302unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012304 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 Py_ssize_t isize;
12306 Py_ssize_t osize, squote, dquote, i, o;
12307 Py_UCS4 max, quote;
12308 int ikind, okind;
12309 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012312 return NULL;
12313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 isize = PyUnicode_GET_LENGTH(unicode);
12315 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 /* Compute length of output, quote characters, and
12318 maximum character */
12319 osize = 2; /* quotes */
12320 max = 127;
12321 squote = dquote = 0;
12322 ikind = PyUnicode_KIND(unicode);
12323 for (i = 0; i < isize; i++) {
12324 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12325 switch (ch) {
12326 case '\'': squote++; osize++; break;
12327 case '"': dquote++; osize++; break;
12328 case '\\': case '\t': case '\r': case '\n':
12329 osize += 2; break;
12330 default:
12331 /* Fast-path ASCII */
12332 if (ch < ' ' || ch == 0x7f)
12333 osize += 4; /* \xHH */
12334 else if (ch < 0x7f)
12335 osize++;
12336 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12337 osize++;
12338 max = ch > max ? ch : max;
12339 }
12340 else if (ch < 0x100)
12341 osize += 4; /* \xHH */
12342 else if (ch < 0x10000)
12343 osize += 6; /* \uHHHH */
12344 else
12345 osize += 10; /* \uHHHHHHHH */
12346 }
12347 }
12348
12349 quote = '\'';
12350 if (squote) {
12351 if (dquote)
12352 /* Both squote and dquote present. Use squote,
12353 and escape them */
12354 osize += squote;
12355 else
12356 quote = '"';
12357 }
12358
12359 repr = PyUnicode_New(osize, max);
12360 if (repr == NULL)
12361 return NULL;
12362 okind = PyUnicode_KIND(repr);
12363 odata = PyUnicode_DATA(repr);
12364
12365 PyUnicode_WRITE(okind, odata, 0, quote);
12366 PyUnicode_WRITE(okind, odata, osize-1, quote);
12367
12368 for (i = 0, o = 1; i < isize; i++) {
12369 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012370
12371 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 if ((ch == quote) || (ch == '\\')) {
12373 PyUnicode_WRITE(okind, odata, o++, '\\');
12374 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012375 continue;
12376 }
12377
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012379 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 PyUnicode_WRITE(okind, odata, o++, '\\');
12381 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012382 }
12383 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 PyUnicode_WRITE(okind, odata, o++, '\\');
12385 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012386 }
12387 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 PyUnicode_WRITE(okind, odata, o++, '\\');
12389 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012390 }
12391
12392 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012393 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 PyUnicode_WRITE(okind, odata, o++, '\\');
12395 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012396 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12397 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012398 }
12399
Georg Brandl559e5d72008-06-11 18:37:52 +000012400 /* Copy ASCII characters as-is */
12401 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012403 }
12404
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012406 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012408 (categories Z* and C* except ASCII space)
12409 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012411 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 if (ch <= 0xff) {
12413 PyUnicode_WRITE(okind, odata, o++, '\\');
12414 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12416 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012417 }
12418 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 else if (ch >= 0x10000) {
12420 PyUnicode_WRITE(okind, odata, o++, '\\');
12421 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12428 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12429 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012430 }
12431 /* Map 16-bit characters to '\uxxxx' */
12432 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 PyUnicode_WRITE(okind, odata, o++, '\\');
12434 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12438 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012439 }
12440 }
12441 /* Copy characters as-is */
12442 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012444 }
12445 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012448 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012449 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450}
12451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012452PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454\n\
12455Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012456such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457arguments start and end are interpreted as in slice notation.\n\
12458\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012459Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460
12461static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012464 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012465 Py_ssize_t start;
12466 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012467 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468
Jesus Ceaac451502011-04-20 17:09:23 +020012469 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12470 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 if (PyUnicode_READY(self) == -1)
12474 return NULL;
12475 if (PyUnicode_READY(substring) == -1)
12476 return NULL;
12477
Victor Stinner7931d9a2011-11-04 00:22:48 +010012478 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479
12480 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 if (result == -2)
12483 return NULL;
12484
Christian Heimes217cfd12007-12-02 14:31:20 +000012485 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012491Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492
12493static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012496 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012497 Py_ssize_t start;
12498 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012499 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
Jesus Ceaac451502011-04-20 17:09:23 +020012501 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12502 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 if (PyUnicode_READY(self) == -1)
12506 return NULL;
12507 if (PyUnicode_READY(substring) == -1)
12508 return NULL;
12509
Victor Stinner7931d9a2011-11-04 00:22:48 +010012510 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
12512 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 if (result == -2)
12515 return NULL;
12516
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517 if (result < 0) {
12518 PyErr_SetString(PyExc_ValueError, "substring not found");
12519 return NULL;
12520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521
Christian Heimes217cfd12007-12-02 14:31:20 +000012522 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523}
12524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012525PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012528Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012529done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
12531static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012532unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012534 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 Py_UCS4 fillchar = ' ';
12536
Victor Stinnere9a29352011-10-01 02:14:59 +020012537 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012539
Benjamin Petersonbac79492012-01-14 13:34:47 -050012540 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541 return NULL;
12542
Victor Stinnerc4b49542011-12-11 22:44:26 +010012543 if (PyUnicode_GET_LENGTH(self) >= width)
12544 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
Victor Stinnerc4b49542011-12-11 22:44:26 +010012546 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547}
12548
Alexander Belopolsky40018472011-02-26 01:02:56 +000012549PyObject *
12550PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551{
12552 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012553
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 s = PyUnicode_FromObject(s);
12555 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012556 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 if (sep != NULL) {
12558 sep = PyUnicode_FromObject(sep);
12559 if (sep == NULL) {
12560 Py_DECREF(s);
12561 return NULL;
12562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 }
12564
Victor Stinner9310abb2011-10-05 00:59:23 +020012565 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
12567 Py_DECREF(s);
12568 Py_XDECREF(sep);
12569 return result;
12570}
12571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012572PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012573 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574\n\
12575Return a list of the words in S, using sep as the\n\
12576delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012577splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012578whitespace string is a separator and empty strings are\n\
12579removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580
12581static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012582unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012584 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012586 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012588 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12589 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590 return NULL;
12591
12592 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012595 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012597 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
Thomas Wouters477c8d52006-05-27 19:21:47 +000012600PyObject *
12601PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12602{
12603 PyObject* str_obj;
12604 PyObject* sep_obj;
12605 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 int kind1, kind2, kind;
12607 void *buf1 = NULL, *buf2 = NULL;
12608 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012609
12610 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012611 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012613 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012614 if (!sep_obj) {
12615 Py_DECREF(str_obj);
12616 return NULL;
12617 }
12618 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12619 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012620 Py_DECREF(str_obj);
12621 return NULL;
12622 }
12623
Victor Stinner14f8f022011-10-05 20:58:25 +020012624 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012626 kind = Py_MAX(kind1, kind2);
12627 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012629 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 if (!buf1)
12631 goto onError;
12632 buf2 = PyUnicode_DATA(sep_obj);
12633 if (kind2 != kind)
12634 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12635 if (!buf2)
12636 goto onError;
12637 len1 = PyUnicode_GET_LENGTH(str_obj);
12638 len2 = PyUnicode_GET_LENGTH(sep_obj);
12639
Benjamin Petersonead6b532011-12-20 17:23:42 -060012640 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012642 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12643 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12644 else
12645 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 break;
12647 case PyUnicode_2BYTE_KIND:
12648 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12649 break;
12650 case PyUnicode_4BYTE_KIND:
12651 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12652 break;
12653 default:
12654 assert(0);
12655 out = 0;
12656 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012657
12658 Py_DECREF(sep_obj);
12659 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 if (kind1 != kind)
12661 PyMem_Free(buf1);
12662 if (kind2 != kind)
12663 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012664
12665 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 onError:
12667 Py_DECREF(sep_obj);
12668 Py_DECREF(str_obj);
12669 if (kind1 != kind && buf1)
12670 PyMem_Free(buf1);
12671 if (kind2 != kind && buf2)
12672 PyMem_Free(buf2);
12673 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012674}
12675
12676
12677PyObject *
12678PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12679{
12680 PyObject* str_obj;
12681 PyObject* sep_obj;
12682 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 int kind1, kind2, kind;
12684 void *buf1 = NULL, *buf2 = NULL;
12685 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012686
12687 str_obj = PyUnicode_FromObject(str_in);
12688 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012690 sep_obj = PyUnicode_FromObject(sep_in);
12691 if (!sep_obj) {
12692 Py_DECREF(str_obj);
12693 return NULL;
12694 }
12695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 kind1 = PyUnicode_KIND(str_in);
12697 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012698 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 buf1 = PyUnicode_DATA(str_in);
12700 if (kind1 != kind)
12701 buf1 = _PyUnicode_AsKind(str_in, kind);
12702 if (!buf1)
12703 goto onError;
12704 buf2 = PyUnicode_DATA(sep_obj);
12705 if (kind2 != kind)
12706 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12707 if (!buf2)
12708 goto onError;
12709 len1 = PyUnicode_GET_LENGTH(str_obj);
12710 len2 = PyUnicode_GET_LENGTH(sep_obj);
12711
Benjamin Petersonead6b532011-12-20 17:23:42 -060012712 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012714 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12715 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12716 else
12717 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 break;
12719 case PyUnicode_2BYTE_KIND:
12720 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12721 break;
12722 case PyUnicode_4BYTE_KIND:
12723 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12724 break;
12725 default:
12726 assert(0);
12727 out = 0;
12728 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729
12730 Py_DECREF(sep_obj);
12731 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 if (kind1 != kind)
12733 PyMem_Free(buf1);
12734 if (kind2 != kind)
12735 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736
12737 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 onError:
12739 Py_DECREF(sep_obj);
12740 Py_DECREF(str_obj);
12741 if (kind1 != kind && buf1)
12742 PyMem_Free(buf1);
12743 if (kind2 != kind && buf2)
12744 PyMem_Free(buf2);
12745 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012746}
12747
12748PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012750\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012751Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012753found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754
12755static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012756unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757{
Victor Stinner9310abb2011-10-05 00:59:23 +020012758 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012759}
12760
12761PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012762 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012763\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012764Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012766separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767
12768static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012769unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012770{
Victor Stinner9310abb2011-10-05 00:59:23 +020012771 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012772}
12773
Alexander Belopolsky40018472011-02-26 01:02:56 +000012774PyObject *
12775PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012776{
12777 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012779 s = PyUnicode_FromObject(s);
12780 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 if (sep != NULL) {
12783 sep = PyUnicode_FromObject(sep);
12784 if (sep == NULL) {
12785 Py_DECREF(s);
12786 return NULL;
12787 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012788 }
12789
Victor Stinner9310abb2011-10-05 00:59:23 +020012790 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012791
12792 Py_DECREF(s);
12793 Py_XDECREF(sep);
12794 return result;
12795}
12796
12797PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012798 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012799\n\
12800Return a list of the words in S, using sep as the\n\
12801delimiter string, starting at the end of the string and\n\
12802working to the front. If maxsplit is given, at most maxsplit\n\
12803splits are done. If sep is not specified, any whitespace string\n\
12804is a separator.");
12805
12806static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012807unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012808{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012809 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012810 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012811 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012812
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012813 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12814 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012815 return NULL;
12816
12817 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012819 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012820 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012822 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012823}
12824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012825PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827\n\
12828Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012829Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012830is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831
12832static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012833unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012835 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012836 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012838 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12839 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840 return NULL;
12841
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012842 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843}
12844
12845static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012846PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012848 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849}
12850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012851PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853\n\
12854Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012855and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856
12857static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012858unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012860 if (PyUnicode_READY(self) == -1)
12861 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012862 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863}
12864
Georg Brandlceee0772007-11-27 23:48:05 +000012865PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012867\n\
12868Return a translation table usable for str.translate().\n\
12869If there is only one argument, it must be a dictionary mapping Unicode\n\
12870ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012871Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012872If there are two arguments, they must be strings of equal length, and\n\
12873in the resulting dictionary, each character in x will be mapped to the\n\
12874character at the same position in y. If there is a third argument, it\n\
12875must be a string, whose characters will be mapped to None in the result.");
12876
12877static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012878unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012879{
12880 PyObject *x, *y = NULL, *z = NULL;
12881 PyObject *new = NULL, *key, *value;
12882 Py_ssize_t i = 0;
12883 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012884
Georg Brandlceee0772007-11-27 23:48:05 +000012885 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12886 return NULL;
12887 new = PyDict_New();
12888 if (!new)
12889 return NULL;
12890 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 int x_kind, y_kind, z_kind;
12892 void *x_data, *y_data, *z_data;
12893
Georg Brandlceee0772007-11-27 23:48:05 +000012894 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012895 if (!PyUnicode_Check(x)) {
12896 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12897 "be a string if there is a second argument");
12898 goto err;
12899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012901 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12902 "arguments must have equal length");
12903 goto err;
12904 }
12905 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 x_kind = PyUnicode_KIND(x);
12907 y_kind = PyUnicode_KIND(y);
12908 x_data = PyUnicode_DATA(x);
12909 y_data = PyUnicode_DATA(y);
12910 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12911 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012912 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012913 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012914 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012915 if (!value) {
12916 Py_DECREF(key);
12917 goto err;
12918 }
Georg Brandlceee0772007-11-27 23:48:05 +000012919 res = PyDict_SetItem(new, key, value);
12920 Py_DECREF(key);
12921 Py_DECREF(value);
12922 if (res < 0)
12923 goto err;
12924 }
12925 /* create entries for deleting chars in z */
12926 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 z_kind = PyUnicode_KIND(z);
12928 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012929 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012931 if (!key)
12932 goto err;
12933 res = PyDict_SetItem(new, key, Py_None);
12934 Py_DECREF(key);
12935 if (res < 0)
12936 goto err;
12937 }
12938 }
12939 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 int kind;
12941 void *data;
12942
Georg Brandlceee0772007-11-27 23:48:05 +000012943 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012944 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012945 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12946 "to maketrans it must be a dict");
12947 goto err;
12948 }
12949 /* copy entries into the new dict, converting string keys to int keys */
12950 while (PyDict_Next(x, &i, &key, &value)) {
12951 if (PyUnicode_Check(key)) {
12952 /* convert string keys to integer keys */
12953 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012954 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012955 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12956 "table must be of length 1");
12957 goto err;
12958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 kind = PyUnicode_KIND(key);
12960 data = PyUnicode_DATA(key);
12961 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012962 if (!newkey)
12963 goto err;
12964 res = PyDict_SetItem(new, newkey, value);
12965 Py_DECREF(newkey);
12966 if (res < 0)
12967 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012968 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012969 /* just keep integer keys */
12970 if (PyDict_SetItem(new, key, value) < 0)
12971 goto err;
12972 } else {
12973 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12974 "be strings or integers");
12975 goto err;
12976 }
12977 }
12978 }
12979 return new;
12980 err:
12981 Py_DECREF(new);
12982 return NULL;
12983}
12984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012985PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987\n\
12988Return a copy of the string S, where all characters have been mapped\n\
12989through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012990Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012991Unmapped characters are left untouched. Characters mapped to None\n\
12992are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993
12994static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998}
12999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013000PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013003Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004
13005static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013006unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013008 if (PyUnicode_READY(self) == -1)
13009 return NULL;
13010 if (PyUnicode_IS_ASCII(self))
13011 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013012 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013}
13014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013015PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013018Pad a numeric string S with zeros on the left, to fill a field\n\
13019of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020
13021static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013022unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013024 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013025 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013026 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 int kind;
13028 void *data;
13029 Py_UCS4 chr;
13030
Martin v. Löwis18e16552006-02-15 17:27:45 +000013031 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032 return NULL;
13033
Benjamin Petersonbac79492012-01-14 13:34:47 -050013034 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036
Victor Stinnerc4b49542011-12-11 22:44:26 +010013037 if (PyUnicode_GET_LENGTH(self) >= width)
13038 return unicode_result_unchanged(self);
13039
13040 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041
13042 u = pad(self, fill, 0, '0');
13043
Walter Dörwald068325e2002-04-15 13:36:47 +000013044 if (u == NULL)
13045 return NULL;
13046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 kind = PyUnicode_KIND(u);
13048 data = PyUnicode_DATA(u);
13049 chr = PyUnicode_READ(kind, data, fill);
13050
13051 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 PyUnicode_WRITE(kind, data, 0, chr);
13054 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055 }
13056
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013057 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013058 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060
#if 0
/* Compiled out: forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013072Return True if S starts with the specified prefix, False otherwise.\n\
13073With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013074With optional end, stop comparing S at that position.\n\
13075prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076
13077static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013078unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013081 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013082 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013083 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013084 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013085 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
Jesus Ceaac451502011-04-20 17:09:23 +020013087 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013089 if (PyTuple_Check(subobj)) {
13090 Py_ssize_t i;
13091 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013092 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013093 if (substring == NULL)
13094 return NULL;
13095 result = tailmatch(self, substring, start, end, -1);
13096 Py_DECREF(substring);
13097 if (result) {
13098 Py_RETURN_TRUE;
13099 }
13100 }
13101 /* nothing matched */
13102 Py_RETURN_FALSE;
13103 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013104 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013105 if (substring == NULL) {
13106 if (PyErr_ExceptionMatches(PyExc_TypeError))
13107 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13108 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013110 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013111 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013113 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114}
13115
13116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013117PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013120Return True if S ends with the specified suffix, False otherwise.\n\
13121With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013122With optional end, stop comparing S at that position.\n\
13123suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
13125static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013126unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013129 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013130 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013131 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013132 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013133 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134
Jesus Ceaac451502011-04-20 17:09:23 +020013135 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013137 if (PyTuple_Check(subobj)) {
13138 Py_ssize_t i;
13139 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013140 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013142 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013144 result = tailmatch(self, substring, start, end, +1);
13145 Py_DECREF(substring);
13146 if (result) {
13147 Py_RETURN_TRUE;
13148 }
13149 }
13150 Py_RETURN_FALSE;
13151 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013152 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013153 if (substring == NULL) {
13154 if (PyErr_ExceptionMatches(PyExc_TypeError))
13155 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13156 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013158 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013159 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013161 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162}
13163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013165
13166PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013168\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013169Return a formatted version of S, using substitutions from args and kwargs.\n\
13170The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013171
Eric Smith27bbca62010-11-04 17:06:58 +000013172PyDoc_STRVAR(format_map__doc__,
13173 "S.format_map(mapping) -> str\n\
13174\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013175Return a formatted version of S, using substitutions from mapping.\n\
13176The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013177
Eric Smith4a7d76d2008-05-30 18:10:19 +000013178static PyObject *
13179unicode__format__(PyObject* self, PyObject* args)
13180{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013181 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013182
13183 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13184 return NULL;
13185
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013186 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013188 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013189}
13190
Eric Smith8c663262007-08-25 02:26:07 +000013191PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013193\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013194Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013195
13196static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013197unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 Py_ssize_t size;
13200
13201 /* If it's a compact object, account for base structure +
13202 character data. */
13203 if (PyUnicode_IS_COMPACT_ASCII(v))
13204 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13205 else if (PyUnicode_IS_COMPACT(v))
13206 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013207 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 else {
13209 /* If it is a two-block object, account for base object, and
13210 for character block if present. */
13211 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013212 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013214 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 }
13216 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013217 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013218 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013220 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013221 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222
13223 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013224}
13225
13226PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013228
13229static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013230unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013231{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013232 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 if (!copy)
13234 return NULL;
13235 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013236}
13237
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013239 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013240 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013241 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13242 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013243 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13244 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013245 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013246 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13247 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13248 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13249 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13250 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013251 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013252 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13253 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13254 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013255 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013256 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13257 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13258 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013259 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013260 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013261 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013262 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013263 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13264 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13265 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13266 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13267 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13268 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13269 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13270 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13271 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13272 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13273 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13274 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13275 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13276 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013277 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013278 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013279 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013280 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013281 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013282 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013283 {"maketrans", (PyCFunction) unicode_maketrans,
13284 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013285 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013286#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013287 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013288 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289#endif
13290
Benjamin Peterson14339b62009-01-31 16:36:08 +000013291 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292 {NULL, NULL}
13293};
13294
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013295static PyObject *
13296unicode_mod(PyObject *v, PyObject *w)
13297{
Brian Curtindfc80e32011-08-10 20:28:54 -050013298 if (!PyUnicode_Check(v))
13299 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013301}
13302
13303static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013304 0, /*nb_add*/
13305 0, /*nb_subtract*/
13306 0, /*nb_multiply*/
13307 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013308};
13309
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 (lenfunc) unicode_length, /* sq_length */
13312 PyUnicode_Concat, /* sq_concat */
13313 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13314 (ssizeargfunc) unicode_getitem, /* sq_item */
13315 0, /* sq_slice */
13316 0, /* sq_ass_item */
13317 0, /* sq_ass_slice */
13318 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319};
13320
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013321static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013322unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013324 if (PyUnicode_READY(self) == -1)
13325 return NULL;
13326
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013327 if (PyIndex_Check(item)) {
13328 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013329 if (i == -1 && PyErr_Occurred())
13330 return NULL;
13331 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013333 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013334 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013335 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013336 PyObject *result;
13337 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013338 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013339 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013342 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013343 return NULL;
13344 }
13345
13346 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013347 Py_INCREF(unicode_empty);
13348 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013350 slicelength == PyUnicode_GET_LENGTH(self)) {
13351 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013352 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013353 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013354 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013355 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013356 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013357 src_kind = PyUnicode_KIND(self);
13358 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013359 if (!PyUnicode_IS_ASCII(self)) {
13360 kind_limit = kind_maxchar_limit(src_kind);
13361 max_char = 0;
13362 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13363 ch = PyUnicode_READ(src_kind, src_data, cur);
13364 if (ch > max_char) {
13365 max_char = ch;
13366 if (max_char >= kind_limit)
13367 break;
13368 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013369 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013370 }
Victor Stinner55c99112011-10-13 01:17:06 +020013371 else
13372 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013373 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013374 if (result == NULL)
13375 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013376 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013377 dest_data = PyUnicode_DATA(result);
13378
13379 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013380 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13381 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013382 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013383 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013384 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013385 } else {
13386 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13387 return NULL;
13388 }
13389}
13390
13391static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013392 (lenfunc)unicode_length, /* mp_length */
13393 (binaryfunc)unicode_subscript, /* mp_subscript */
13394 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013395};
13396
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398/* Helpers for PyUnicode_Format() */
13399
13400static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013401getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013403 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 (*p_argidx)++;
13406 if (arglen < 0)
13407 return args;
13408 else
13409 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410 }
13411 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413 return NULL;
13414}
13415
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013416/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013418static PyObject *
13419formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013421 char *p;
13422 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013424
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425 x = PyFloat_AsDouble(v);
13426 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013427 return NULL;
13428
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013431
Eric Smith0923d1d2009-04-16 20:16:10 +000013432 p = PyOS_double_to_string(x, type, prec,
13433 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013434 if (p == NULL)
13435 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013437 PyMem_Free(p);
13438 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439}
13440
Victor Stinnerd0880d52012-04-27 23:40:13 +020013441/* formatlong() emulates the format codes d, u, o, x and X, and
13442 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13443 * Python's regular ints.
13444 * Return value: a new PyUnicodeObject*, or NULL if error.
13445 * The output string is of the form
13446 * "-"? ("0x" | "0X")? digit+
13447 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13448 * set in flags. The case of hex digits will be correct,
13449 * There will be at least prec digits, zero-filled on the left if
13450 * necessary to get that many.
13451 * val object to be converted
13452 * flags bitmask of format flags; only F_ALT is looked at
13453 * prec minimum number of digits; 0-fill on left if needed
13454 * type a character in [duoxX]; u acts the same as d
13455 *
13456 * CAUTION: o, x and X conversions on regular ints can never
13457 * produce a '-' sign, but can for Python's unbounded ints.
13458 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013459static PyObject*
13460formatlong(PyObject *val, int flags, int prec, int type)
13461{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013462 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013463 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013464 Py_ssize_t i;
13465 int sign; /* 1 if '-', else 0 */
13466 int len; /* number of characters */
13467 Py_ssize_t llen;
13468 int numdigits; /* len == numnondigits + numdigits */
13469 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013470
Victor Stinnerd0880d52012-04-27 23:40:13 +020013471 /* Avoid exceeding SSIZE_T_MAX */
13472 if (prec > INT_MAX-3) {
13473 PyErr_SetString(PyExc_OverflowError,
13474 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013475 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013476 }
13477
13478 assert(PyLong_Check(val));
13479
13480 switch (type) {
13481 case 'd':
13482 case 'u':
13483 /* Special-case boolean: we want 0/1 */
13484 result = Py_TYPE(val)->tp_str(val);
13485 break;
13486 case 'o':
13487 numnondigits = 2;
13488 result = PyNumber_ToBase(val, 8);
13489 break;
13490 case 'x':
13491 case 'X':
13492 numnondigits = 2;
13493 result = PyNumber_ToBase(val, 16);
13494 break;
13495 default:
13496 assert(!"'type' not in [duoxX]");
13497 }
13498 if (!result)
13499 return NULL;
13500
13501 assert(unicode_modifiable(result));
13502 assert(PyUnicode_IS_READY(result));
13503 assert(PyUnicode_IS_ASCII(result));
13504
13505 /* To modify the string in-place, there can only be one reference. */
13506 if (Py_REFCNT(result) != 1) {
13507 PyErr_BadInternalCall();
13508 return NULL;
13509 }
13510 buf = PyUnicode_DATA(result);
13511 llen = PyUnicode_GET_LENGTH(result);
13512 if (llen > INT_MAX) {
13513 PyErr_SetString(PyExc_ValueError,
13514 "string too large in _PyBytes_FormatLong");
13515 return NULL;
13516 }
13517 len = (int)llen;
13518 sign = buf[0] == '-';
13519 numnondigits += sign;
13520 numdigits = len - numnondigits;
13521 assert(numdigits > 0);
13522
13523 /* Get rid of base marker unless F_ALT */
13524 if (((flags & F_ALT) == 0 &&
13525 (type == 'o' || type == 'x' || type == 'X'))) {
13526 assert(buf[sign] == '0');
13527 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13528 buf[sign+1] == 'o');
13529 numnondigits -= 2;
13530 buf += 2;
13531 len -= 2;
13532 if (sign)
13533 buf[0] = '-';
13534 assert(len == numnondigits + numdigits);
13535 assert(numdigits > 0);
13536 }
13537
13538 /* Fill with leading zeroes to meet minimum width. */
13539 if (prec > numdigits) {
13540 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13541 numnondigits + prec);
13542 char *b1;
13543 if (!r1) {
13544 Py_DECREF(result);
13545 return NULL;
13546 }
13547 b1 = PyBytes_AS_STRING(r1);
13548 for (i = 0; i < numnondigits; ++i)
13549 *b1++ = *buf++;
13550 for (i = 0; i < prec - numdigits; i++)
13551 *b1++ = '0';
13552 for (i = 0; i < numdigits; i++)
13553 *b1++ = *buf++;
13554 *b1 = '\0';
13555 Py_DECREF(result);
13556 result = r1;
13557 buf = PyBytes_AS_STRING(result);
13558 len = numnondigits + prec;
13559 }
13560
13561 /* Fix up case for hex conversions. */
13562 if (type == 'X') {
13563 /* Need to convert all lower case letters to upper case.
13564 and need to convert 0x to 0X (and -0x to -0X). */
13565 for (i = 0; i < len; i++)
13566 if (buf[i] >= 'a' && buf[i] <= 'x')
13567 buf[i] -= 'a'-'A';
13568 }
13569 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13570 PyObject *unicode;
13571 unicode = unicode_fromascii((unsigned char *)buf, len);
13572 Py_DECREF(result);
13573 result = unicode;
13574 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013575 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013576}
13577
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578static Py_UCS4
13579formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013580{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013581 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013582 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013583 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013584 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 goto onError;
13587 }
13588 else {
13589 /* Integer input truncated to a character */
13590 long x;
13591 x = PyLong_AsLong(v);
13592 if (x == -1 && PyErr_Occurred())
13593 goto onError;
13594
Victor Stinner8faf8212011-12-08 22:14:11 +010013595 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 PyErr_SetString(PyExc_OverflowError,
13597 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 }
13600
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013601 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013602 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013603
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013605 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013607 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013608}
13609
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013610static int
13611repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13612{
13613 int r;
13614 assert(count > 0);
13615 assert(PyUnicode_Check(obj));
13616 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013617 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013618 if (repeated == NULL)
13619 return -1;
13620 r = _PyAccu_Accumulate(acc, repeated);
13621 Py_DECREF(repeated);
13622 return r;
13623 }
13624 else {
13625 do {
13626 if (_PyAccu_Accumulate(acc, obj))
13627 return -1;
13628 } while (--count);
13629 return 0;
13630 }
13631}
13632
Alexander Belopolsky40018472011-02-26 01:02:56 +000013633PyObject *
13634PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013636 void *fmt;
13637 int fmtkind;
13638 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013639 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013640 int r;
13641 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013642 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013644 PyObject *temp = NULL;
13645 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013646 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 _PyAccu acc;
13648 static PyObject *plus, *minus, *blank, *zero, *percent;
13649
13650 if (!plus && !(plus = get_latin1_char('+')))
13651 return NULL;
13652 if (!minus && !(minus = get_latin1_char('-')))
13653 return NULL;
13654 if (!blank && !(blank = get_latin1_char(' ')))
13655 return NULL;
13656 if (!zero && !(zero = get_latin1_char('0')))
13657 return NULL;
13658 if (!percent && !(percent = get_latin1_char('%')))
13659 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013660
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 PyErr_BadInternalCall();
13663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013664 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013665 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013666 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013668 if (PyUnicode_READY(uformat) == -1)
13669 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013670 if (_PyAccu_Init(&acc))
13671 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 fmt = PyUnicode_DATA(uformat);
13673 fmtkind = PyUnicode_KIND(uformat);
13674 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13675 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013676
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 arglen = PyTuple_Size(args);
13679 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 }
13681 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 arglen = -1;
13683 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013685 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013686 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688
13689 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013690 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013691 PyObject *nonfmt;
13692 Py_ssize_t nonfmtpos;
13693 nonfmtpos = fmtpos++;
13694 while (fmtcnt >= 0 &&
13695 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13696 fmtpos++;
13697 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013698 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013699 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013700 if (nonfmt == NULL)
13701 goto onError;
13702 r = _PyAccu_Accumulate(&acc, nonfmt);
13703 Py_DECREF(nonfmt);
13704 if (r)
13705 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 }
13707 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 /* Got a format specifier */
13709 int flags = 0;
13710 Py_ssize_t width = -1;
13711 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013712 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013713 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 int isnumok;
13715 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013716 void *pbuf = NULL;
13717 Py_ssize_t pindex, len;
13718 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013720 fmtpos++;
13721 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13722 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013723 Py_ssize_t keylen;
13724 PyObject *key;
13725 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013726
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 if (dict == NULL) {
13728 PyErr_SetString(PyExc_TypeError,
13729 "format requires a mapping");
13730 goto onError;
13731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013732 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013734 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013735 /* Skip over balanced parentheses */
13736 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013737 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013739 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013741 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013743 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013744 if (fmtcnt < 0 || pcount > 0) {
13745 PyErr_SetString(PyExc_ValueError,
13746 "incomplete format key");
13747 goto onError;
13748 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013749 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013750 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 if (key == NULL)
13752 goto onError;
13753 if (args_owned) {
13754 Py_DECREF(args);
13755 args_owned = 0;
13756 }
13757 args = PyObject_GetItem(dict, key);
13758 Py_DECREF(key);
13759 if (args == NULL) {
13760 goto onError;
13761 }
13762 args_owned = 1;
13763 arglen = -1;
13764 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013765 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013767 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 case '-': flags |= F_LJUST; continue;
13769 case '+': flags |= F_SIGN; continue;
13770 case ' ': flags |= F_BLANK; continue;
13771 case '#': flags |= F_ALT; continue;
13772 case '0': flags |= F_ZERO; continue;
13773 }
13774 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013775 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 if (c == '*') {
13777 v = getnextarg(args, arglen, &argidx);
13778 if (v == NULL)
13779 goto onError;
13780 if (!PyLong_Check(v)) {
13781 PyErr_SetString(PyExc_TypeError,
13782 "* wants int");
13783 goto onError;
13784 }
13785 width = PyLong_AsLong(v);
13786 if (width == -1 && PyErr_Occurred())
13787 goto onError;
13788 if (width < 0) {
13789 flags |= F_LJUST;
13790 width = -width;
13791 }
13792 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013793 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 }
13795 else if (c >= '0' && c <= '9') {
13796 width = c - '0';
13797 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 if (c < '0' || c > '9')
13800 break;
13801 if ((width*10) / 10 != width) {
13802 PyErr_SetString(PyExc_ValueError,
13803 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013804 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 }
13806 width = width*10 + (c - '0');
13807 }
13808 }
13809 if (c == '.') {
13810 prec = 0;
13811 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013813 if (c == '*') {
13814 v = getnextarg(args, arglen, &argidx);
13815 if (v == NULL)
13816 goto onError;
13817 if (!PyLong_Check(v)) {
13818 PyErr_SetString(PyExc_TypeError,
13819 "* wants int");
13820 goto onError;
13821 }
13822 prec = PyLong_AsLong(v);
13823 if (prec == -1 && PyErr_Occurred())
13824 goto onError;
13825 if (prec < 0)
13826 prec = 0;
13827 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 }
13830 else if (c >= '0' && c <= '9') {
13831 prec = c - '0';
13832 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013834 if (c < '0' || c > '9')
13835 break;
13836 if ((prec*10) / 10 != prec) {
13837 PyErr_SetString(PyExc_ValueError,
13838 "prec too big");
13839 goto onError;
13840 }
13841 prec = prec*10 + (c - '0');
13842 }
13843 }
13844 } /* prec */
13845 if (fmtcnt >= 0) {
13846 if (c == 'h' || c == 'l' || c == 'L') {
13847 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013848 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013849 }
13850 }
13851 if (fmtcnt < 0) {
13852 PyErr_SetString(PyExc_ValueError,
13853 "incomplete format");
13854 goto onError;
13855 }
13856 if (c != '%') {
13857 v = getnextarg(args, arglen, &argidx);
13858 if (v == NULL)
13859 goto onError;
13860 }
13861 sign = 0;
13862 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013863 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013864 switch (c) {
13865
13866 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013867 _PyAccu_Accumulate(&acc, percent);
13868 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013869
13870 case 's':
13871 case 'r':
13872 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013873 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 temp = v;
13875 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013876 }
13877 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 if (c == 's')
13879 temp = PyObject_Str(v);
13880 else if (c == 'r')
13881 temp = PyObject_Repr(v);
13882 else
13883 temp = PyObject_ASCII(v);
13884 if (temp == NULL)
13885 goto onError;
13886 if (PyUnicode_Check(temp))
13887 /* nothing to do */;
13888 else {
13889 Py_DECREF(temp);
13890 PyErr_SetString(PyExc_TypeError,
13891 "%s argument has non-string str()");
13892 goto onError;
13893 }
13894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 if (PyUnicode_READY(temp) == -1) {
13896 Py_CLEAR(temp);
13897 goto onError;
13898 }
13899 pbuf = PyUnicode_DATA(temp);
13900 kind = PyUnicode_KIND(temp);
13901 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013902 if (prec >= 0 && len > prec)
13903 len = prec;
13904 break;
13905
13906 case 'i':
13907 case 'd':
13908 case 'u':
13909 case 'o':
13910 case 'x':
13911 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013912 isnumok = 0;
13913 if (PyNumber_Check(v)) {
13914 PyObject *iobj=NULL;
13915
13916 if (PyLong_Check(v)) {
13917 iobj = v;
13918 Py_INCREF(iobj);
13919 }
13920 else {
13921 iobj = PyNumber_Long(v);
13922 }
13923 if (iobj!=NULL) {
13924 if (PyLong_Check(iobj)) {
13925 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013926 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013927 Py_DECREF(iobj);
13928 if (!temp)
13929 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013930 if (PyUnicode_READY(temp) == -1) {
13931 Py_CLEAR(temp);
13932 goto onError;
13933 }
13934 pbuf = PyUnicode_DATA(temp);
13935 kind = PyUnicode_KIND(temp);
13936 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 sign = 1;
13938 }
13939 else {
13940 Py_DECREF(iobj);
13941 }
13942 }
13943 }
13944 if (!isnumok) {
13945 PyErr_Format(PyExc_TypeError,
13946 "%%%c format: a number is required, "
13947 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13948 goto onError;
13949 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013951 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013952 fillobj = zero;
13953 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013954 break;
13955
13956 case 'e':
13957 case 'E':
13958 case 'f':
13959 case 'F':
13960 case 'g':
13961 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013962 temp = formatfloat(v, flags, prec, c);
13963 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013964 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013965 if (PyUnicode_READY(temp) == -1) {
13966 Py_CLEAR(temp);
13967 goto onError;
13968 }
13969 pbuf = PyUnicode_DATA(temp);
13970 kind = PyUnicode_KIND(temp);
13971 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013972 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013973 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013975 fillobj = zero;
13976 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013977 break;
13978
13979 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013980 {
13981 Py_UCS4 ch = formatchar(v);
13982 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013983 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013984 temp = _PyUnicode_FromUCS4(&ch, 1);
13985 if (temp == NULL)
13986 goto onError;
13987 pbuf = PyUnicode_DATA(temp);
13988 kind = PyUnicode_KIND(temp);
13989 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013990 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013991 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013992
13993 default:
13994 PyErr_Format(PyExc_ValueError,
13995 "unsupported format character '%c' (0x%x) "
13996 "at index %zd",
13997 (31<=c && c<=126) ? (char)c : '?',
13998 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013999 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 goto onError;
14001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014002 /* pbuf is initialized here. */
14003 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000014004 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014005 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
14006 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014008 pindex++;
14009 }
14010 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
14011 signobj = plus;
14012 len--;
14013 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000014014 }
14015 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014016 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014018 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000014019 else
14020 sign = 0;
14021 }
14022 if (width < len)
14023 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014024 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014025 if (fill != ' ') {
14026 assert(signobj != NULL);
14027 if (_PyAccu_Accumulate(&acc, signobj))
14028 goto onError;
14029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014030 if (width > len)
14031 width--;
14032 }
14033 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014035 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014036 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014037 second = get_latin1_char(
14038 PyUnicode_READ(kind, pbuf, pindex + 1));
14039 pindex += 2;
14040 if (second == NULL ||
14041 _PyAccu_Accumulate(&acc, zero) ||
14042 _PyAccu_Accumulate(&acc, second))
14043 goto onError;
14044 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000014045 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014046 width -= 2;
14047 if (width < 0)
14048 width = 0;
14049 len -= 2;
14050 }
14051 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014052 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014053 if (repeat_accumulate(&acc, fillobj, width - len))
14054 goto onError;
14055 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014056 }
14057 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014058 if (sign) {
14059 assert(signobj != NULL);
14060 if (_PyAccu_Accumulate(&acc, signobj))
14061 goto onError;
14062 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014063 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014064 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14065 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014066 second = get_latin1_char(
14067 PyUnicode_READ(kind, pbuf, pindex + 1));
14068 pindex += 2;
14069 if (second == NULL ||
14070 _PyAccu_Accumulate(&acc, zero) ||
14071 _PyAccu_Accumulate(&acc, second))
14072 goto onError;
14073 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014074 }
14075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014076 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014077 if (temp != NULL) {
14078 assert(pbuf == PyUnicode_DATA(temp));
14079 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014080 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014081 else {
14082 const char *p = (const char *) pbuf;
14083 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014084 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014085 v = PyUnicode_FromKindAndData(kind, p, len);
14086 }
14087 if (v == NULL)
14088 goto onError;
14089 r = _PyAccu_Accumulate(&acc, v);
14090 Py_DECREF(v);
14091 if (r)
14092 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014093 if (width > len && repeat_accumulate(&acc, blank, width - len))
14094 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000014095 if (dict && (argidx < arglen) && c != '%') {
14096 PyErr_SetString(PyExc_TypeError,
14097 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014098 goto onError;
14099 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014100 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014101 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014102 } /* until end */
14103 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014104 PyErr_SetString(PyExc_TypeError,
14105 "not all arguments converted during string formatting");
14106 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014107 }
14108
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014109 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014111 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014112 }
14113 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014114 Py_XDECREF(temp);
14115 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014116 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014117
Benjamin Peterson29060642009-01-31 22:14:21 +000014118 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014120 Py_XDECREF(temp);
14121 Py_XDECREF(second);
14122 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014124 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014125 }
14126 return NULL;
14127}
14128
Jeremy Hylton938ace62002-07-17 16:30:39 +000014129static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014130unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14131
Tim Peters6d6c1a32001-08-02 04:15:00 +000014132static PyObject *
14133unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14134{
Benjamin Peterson29060642009-01-31 22:14:21 +000014135 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 static char *kwlist[] = {"object", "encoding", "errors", 0};
14137 char *encoding = NULL;
14138 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014139
Benjamin Peterson14339b62009-01-31 16:36:08 +000014140 if (type != &PyUnicode_Type)
14141 return unicode_subtype_new(type, args, kwds);
14142 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014143 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014145 if (x == NULL) {
14146 Py_INCREF(unicode_empty);
14147 return unicode_empty;
14148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014149 if (encoding == NULL && errors == NULL)
14150 return PyObject_Str(x);
14151 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014152 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014153}
14154
Guido van Rossume023fe02001-08-30 03:12:59 +000014155static PyObject *
14156unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14157{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014158 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014159 Py_ssize_t length, char_size;
14160 int share_wstr, share_utf8;
14161 unsigned int kind;
14162 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014163
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014165
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014166 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014167 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014168 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014169 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014170 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014171 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014172 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014173 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014174
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014175 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014176 if (self == NULL) {
14177 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014178 return NULL;
14179 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014180 kind = PyUnicode_KIND(unicode);
14181 length = PyUnicode_GET_LENGTH(unicode);
14182
14183 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014184#ifdef Py_DEBUG
14185 _PyUnicode_HASH(self) = -1;
14186#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014187 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014188#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014189 _PyUnicode_STATE(self).interned = 0;
14190 _PyUnicode_STATE(self).kind = kind;
14191 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014192 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014193 _PyUnicode_STATE(self).ready = 1;
14194 _PyUnicode_WSTR(self) = NULL;
14195 _PyUnicode_UTF8_LENGTH(self) = 0;
14196 _PyUnicode_UTF8(self) = NULL;
14197 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014198 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014199
14200 share_utf8 = 0;
14201 share_wstr = 0;
14202 if (kind == PyUnicode_1BYTE_KIND) {
14203 char_size = 1;
14204 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14205 share_utf8 = 1;
14206 }
14207 else if (kind == PyUnicode_2BYTE_KIND) {
14208 char_size = 2;
14209 if (sizeof(wchar_t) == 2)
14210 share_wstr = 1;
14211 }
14212 else {
14213 assert(kind == PyUnicode_4BYTE_KIND);
14214 char_size = 4;
14215 if (sizeof(wchar_t) == 4)
14216 share_wstr = 1;
14217 }
14218
14219 /* Ensure we won't overflow the length. */
14220 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14221 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014222 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014223 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014224 data = PyObject_MALLOC((length + 1) * char_size);
14225 if (data == NULL) {
14226 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014227 goto onError;
14228 }
14229
Victor Stinnerc3c74152011-10-02 20:39:55 +020014230 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014231 if (share_utf8) {
14232 _PyUnicode_UTF8_LENGTH(self) = length;
14233 _PyUnicode_UTF8(self) = data;
14234 }
14235 if (share_wstr) {
14236 _PyUnicode_WSTR_LENGTH(self) = length;
14237 _PyUnicode_WSTR(self) = (wchar_t *)data;
14238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014239
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014240 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014241 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014242 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014243#ifdef Py_DEBUG
14244 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14245#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014246 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014247 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014248
14249onError:
14250 Py_DECREF(unicode);
14251 Py_DECREF(self);
14252 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014253}
14254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014255PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014256 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014257\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014258Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014259encoding defaults to the current default string encoding.\n\
14260errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014261
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014262static PyObject *unicode_iter(PyObject *seq);
14263
Guido van Rossumd57fd912000-03-10 22:53:23 +000014264PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014265 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014266 "str", /* tp_name */
14267 sizeof(PyUnicodeObject), /* tp_size */
14268 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014269 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014270 (destructor)unicode_dealloc, /* tp_dealloc */
14271 0, /* tp_print */
14272 0, /* tp_getattr */
14273 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014274 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 unicode_repr, /* tp_repr */
14276 &unicode_as_number, /* tp_as_number */
14277 &unicode_as_sequence, /* tp_as_sequence */
14278 &unicode_as_mapping, /* tp_as_mapping */
14279 (hashfunc) unicode_hash, /* tp_hash*/
14280 0, /* tp_call*/
14281 (reprfunc) unicode_str, /* tp_str */
14282 PyObject_GenericGetAttr, /* tp_getattro */
14283 0, /* tp_setattro */
14284 0, /* tp_as_buffer */
14285 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014286 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 unicode_doc, /* tp_doc */
14288 0, /* tp_traverse */
14289 0, /* tp_clear */
14290 PyUnicode_RichCompare, /* tp_richcompare */
14291 0, /* tp_weaklistoffset */
14292 unicode_iter, /* tp_iter */
14293 0, /* tp_iternext */
14294 unicode_methods, /* tp_methods */
14295 0, /* tp_members */
14296 0, /* tp_getset */
14297 &PyBaseObject_Type, /* tp_base */
14298 0, /* tp_dict */
14299 0, /* tp_descr_get */
14300 0, /* tp_descr_set */
14301 0, /* tp_dictoffset */
14302 0, /* tp_init */
14303 0, /* tp_alloc */
14304 unicode_new, /* tp_new */
14305 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014306};
14307
14308/* Initialize the Unicode implementation */
14309
Victor Stinner3a50e702011-10-18 21:21:00 +020014310int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014311{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014312 int i;
14313
Thomas Wouters477c8d52006-05-27 19:21:47 +000014314 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014315 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014316 0x000A, /* LINE FEED */
14317 0x000D, /* CARRIAGE RETURN */
14318 0x001C, /* FILE SEPARATOR */
14319 0x001D, /* GROUP SEPARATOR */
14320 0x001E, /* RECORD SEPARATOR */
14321 0x0085, /* NEXT LINE */
14322 0x2028, /* LINE SEPARATOR */
14323 0x2029, /* PARAGRAPH SEPARATOR */
14324 };
14325
Fred Drakee4315f52000-05-09 19:53:39 +000014326 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014327 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014328 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014329 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014330 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014332 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014333 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014334 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014335 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014336
14337 /* initialize the linebreak bloom filter */
14338 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014339 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014340 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014341
14342 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014343
14344#ifdef HAVE_MBCS
14345 winver.dwOSVersionInfoSize = sizeof(winver);
14346 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14347 PyErr_SetFromWindowsErr(0);
14348 return -1;
14349 }
14350#endif
14351 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014352}
14353
14354/* Finalize the Unicode implementation */
14355
/* Does no work and always returns 0.
   NOTE(review): presumably kept so existing callers keep linking; the
   return value looks like a count of freed objects -- confirm. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14361
Guido van Rossumd57fd912000-03-10 22:53:23 +000014362void
Thomas Wouters78890102000-07-22 19:25:51 +000014363_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014364{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014365 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014366
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014367 Py_XDECREF(unicode_empty);
14368 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014369
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014370 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014371 if (unicode_latin1[i]) {
14372 Py_DECREF(unicode_latin1[i]);
14373 unicode_latin1[i] = NULL;
14374 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014375 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014376 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014377 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014378}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014379
Walter Dörwald16807132007-05-25 13:52:07 +000014380void
14381PyUnicode_InternInPlace(PyObject **p)
14382{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014383 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014384 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014385#ifdef Py_DEBUG
14386 assert(s != NULL);
14387 assert(_PyUnicode_CHECK(s));
14388#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014389 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014390 return;
14391#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014392 /* If it's a subclass, we don't really know what putting
14393 it in the interned dict might do. */
14394 if (!PyUnicode_CheckExact(s))
14395 return;
14396 if (PyUnicode_CHECK_INTERNED(s))
14397 return;
14398 if (interned == NULL) {
14399 interned = PyDict_New();
14400 if (interned == NULL) {
14401 PyErr_Clear(); /* Don't leave an exception */
14402 return;
14403 }
14404 }
14405 /* It might be that the GetItem call fails even
14406 though the key is present in the dictionary,
14407 namely when this happens during a stack overflow. */
14408 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014409 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014410 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014411
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 if (t) {
14413 Py_INCREF(t);
14414 Py_DECREF(*p);
14415 *p = t;
14416 return;
14417 }
Walter Dörwald16807132007-05-25 13:52:07 +000014418
Benjamin Peterson14339b62009-01-31 16:36:08 +000014419 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014420 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014421 PyErr_Clear();
14422 PyThreadState_GET()->recursion_critical = 0;
14423 return;
14424 }
14425 PyThreadState_GET()->recursion_critical = 0;
14426 /* The two references in interned are not counted by refcnt.
14427 The deallocator will take care of this */
14428 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014429 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014430}
14431
14432void
14433PyUnicode_InternImmortal(PyObject **p)
14434{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014435 PyUnicode_InternInPlace(p);
14436 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014437 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014438 Py_INCREF(*p);
14439 }
Walter Dörwald16807132007-05-25 13:52:07 +000014440}
14441
14442PyObject *
14443PyUnicode_InternFromString(const char *cp)
14444{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 PyObject *s = PyUnicode_FromString(cp);
14446 if (s == NULL)
14447 return NULL;
14448 PyUnicode_InternInPlace(&s);
14449 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014450}
14451
Alexander Belopolsky40018472011-02-26 01:02:56 +000014452void
14453_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014454{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014455 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014456 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014457 Py_ssize_t i, n;
14458 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014459
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 if (interned == NULL || !PyDict_Check(interned))
14461 return;
14462 keys = PyDict_Keys(interned);
14463 if (keys == NULL || !PyList_Check(keys)) {
14464 PyErr_Clear();
14465 return;
14466 }
Walter Dörwald16807132007-05-25 13:52:07 +000014467
Benjamin Peterson14339b62009-01-31 16:36:08 +000014468 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14469 detector, interned unicode strings are not forcibly deallocated;
14470 rather, we give them their stolen references back, and then clear
14471 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014472
Benjamin Peterson14339b62009-01-31 16:36:08 +000014473 n = PyList_GET_SIZE(keys);
14474 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014475 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014476 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014477 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014478 if (PyUnicode_READY(s) == -1) {
14479 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014480 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014482 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 case SSTATE_NOT_INTERNED:
14484 /* XXX Shouldn't happen */
14485 break;
14486 case SSTATE_INTERNED_IMMORTAL:
14487 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014488 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 break;
14490 case SSTATE_INTERNED_MORTAL:
14491 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014492 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014493 break;
14494 default:
14495 Py_FatalError("Inconsistent interned string state.");
14496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014497 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014498 }
14499 fprintf(stderr, "total size of all interned strings: "
14500 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14501 "mortal/immortal\n", mortal_size, immortal_size);
14502 Py_DECREF(keys);
14503 PyDict_Clear(interned);
14504 Py_DECREF(interned);
14505 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014506}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014507
14508
/********************* Unicode Iterator **************************/
14510
14511typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014512 PyObject_HEAD
14513 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014514 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014515} unicodeiterobject;
14516
14517static void
14518unicodeiter_dealloc(unicodeiterobject *it)
14519{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014520 _PyObject_GC_UNTRACK(it);
14521 Py_XDECREF(it->it_seq);
14522 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014523}
14524
14525static int
14526unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14527{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014528 Py_VISIT(it->it_seq);
14529 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014530}
14531
14532static PyObject *
14533unicodeiter_next(unicodeiterobject *it)
14534{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014535 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014536
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 assert(it != NULL);
14538 seq = it->it_seq;
14539 if (seq == NULL)
14540 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014541 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014543 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14544 int kind = PyUnicode_KIND(seq);
14545 void *data = PyUnicode_DATA(seq);
14546 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14547 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014548 if (item != NULL)
14549 ++it->it_index;
14550 return item;
14551 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014552
Benjamin Peterson14339b62009-01-31 16:36:08 +000014553 Py_DECREF(seq);
14554 it->it_seq = NULL;
14555 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014556}
14557
14558static PyObject *
14559unicodeiter_len(unicodeiterobject *it)
14560{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014561 Py_ssize_t len = 0;
14562 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014563 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014564 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014565}
14566
14567PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14568
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014569static PyObject *
14570unicodeiter_reduce(unicodeiterobject *it)
14571{
14572 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014573 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014574 it->it_seq, it->it_index);
14575 } else {
14576 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14577 if (u == NULL)
14578 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014579 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014580 }
14581}
14582
14583PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14584
14585static PyObject *
14586unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14587{
14588 Py_ssize_t index = PyLong_AsSsize_t(state);
14589 if (index == -1 && PyErr_Occurred())
14590 return NULL;
14591 if (index < 0)
14592 index = 0;
14593 it->it_index = index;
14594 Py_RETURN_NONE;
14595}
14596
14597PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14598
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014599static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014600 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014601 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014602 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14603 reduce_doc},
14604 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14605 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014606 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014607};
14608
14609PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014610 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14611 "str_iterator", /* tp_name */
14612 sizeof(unicodeiterobject), /* tp_basicsize */
14613 0, /* tp_itemsize */
14614 /* methods */
14615 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14616 0, /* tp_print */
14617 0, /* tp_getattr */
14618 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014619 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014620 0, /* tp_repr */
14621 0, /* tp_as_number */
14622 0, /* tp_as_sequence */
14623 0, /* tp_as_mapping */
14624 0, /* tp_hash */
14625 0, /* tp_call */
14626 0, /* tp_str */
14627 PyObject_GenericGetAttr, /* tp_getattro */
14628 0, /* tp_setattro */
14629 0, /* tp_as_buffer */
14630 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14631 0, /* tp_doc */
14632 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14633 0, /* tp_clear */
14634 0, /* tp_richcompare */
14635 0, /* tp_weaklistoffset */
14636 PyObject_SelfIter, /* tp_iter */
14637 (iternextfunc)unicodeiter_next, /* tp_iternext */
14638 unicodeiter_methods, /* tp_methods */
14639 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014640};
14641
14642static PyObject *
14643unicode_iter(PyObject *seq)
14644{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014645 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014646
Benjamin Peterson14339b62009-01-31 16:36:08 +000014647 if (!PyUnicode_Check(seq)) {
14648 PyErr_BadInternalCall();
14649 return NULL;
14650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014651 if (PyUnicode_READY(seq) == -1)
14652 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014653 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14654 if (it == NULL)
14655 return NULL;
14656 it->it_index = 0;
14657 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014658 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014659 _PyObject_GC_TRACK(it);
14660 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014661}
14662
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014663
14664size_t
14665Py_UNICODE_strlen(const Py_UNICODE *u)
14666{
14667 int res = 0;
14668 while(*u++)
14669 res++;
14670 return res;
14671}
14672
14673Py_UNICODE*
14674Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14675{
14676 Py_UNICODE *u = s1;
14677 while ((*u++ = *s2++));
14678 return s1;
14679}
14680
14681Py_UNICODE*
14682Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14683{
14684 Py_UNICODE *u = s1;
14685 while ((*u++ = *s2++))
14686 if (n-- == 0)
14687 break;
14688 return s1;
14689}
14690
14691Py_UNICODE*
14692Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14693{
14694 Py_UNICODE *u1 = s1;
14695 u1 += Py_UNICODE_strlen(u1);
14696 Py_UNICODE_strcpy(u1, s2);
14697 return s1;
14698}
14699
14700int
14701Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14702{
14703 while (*s1 && *s2 && *s1 == *s2)
14704 s1++, s2++;
14705 if (*s1 && *s2)
14706 return (*s1 < *s2) ? -1 : +1;
14707 if (*s1)
14708 return 1;
14709 if (*s2)
14710 return -1;
14711 return 0;
14712}
14713
14714int
14715Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14716{
14717 register Py_UNICODE u1, u2;
14718 for (; n != 0; n--) {
14719 u1 = *s1;
14720 u2 = *s2;
14721 if (u1 != u2)
14722 return (u1 < u2) ? -1 : +1;
14723 if (u1 == '\0')
14724 return 0;
14725 s1++;
14726 s2++;
14727 }
14728 return 0;
14729}
14730
14731Py_UNICODE*
14732Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14733{
14734 const Py_UNICODE *p;
14735 for (p = s; *p; p++)
14736 if (*p == c)
14737 return (Py_UNICODE*)p;
14738 return NULL;
14739}
14740
14741Py_UNICODE*
14742Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14743{
14744 const Py_UNICODE *p;
14745 p = s + Py_UNICODE_strlen(s);
14746 while (p != s) {
14747 p--;
14748 if (*p == c)
14749 return (Py_UNICODE*)p;
14750 }
14751 return NULL;
14752}
Victor Stinner331ea922010-08-10 16:37:20 +000014753
Victor Stinner71133ff2010-09-01 23:43:53 +000014754Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014755PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014756{
Victor Stinner577db2c2011-10-11 22:12:48 +020014757 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014758 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014760 if (!PyUnicode_Check(unicode)) {
14761 PyErr_BadArgument();
14762 return NULL;
14763 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014764 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014765 if (u == NULL)
14766 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014767 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014768 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014769 PyErr_NoMemory();
14770 return NULL;
14771 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014772 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014773 size *= sizeof(Py_UNICODE);
14774 copy = PyMem_Malloc(size);
14775 if (copy == NULL) {
14776 PyErr_NoMemory();
14777 return NULL;
14778 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014779 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014780 return copy;
14781}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014782
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
14785
14786static PyMethodDef _string_methods[] = {
14787 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14788 METH_O, PyDoc_STR("split the argument as a field name")},
14789 {"formatter_parser", (PyCFunction) formatter_parser,
14790 METH_O, PyDoc_STR("parse the argument as a format string")},
14791 {NULL, NULL}
14792};
14793
14794static struct PyModuleDef _string_module = {
14795 PyModuleDef_HEAD_INIT,
14796 "_string",
14797 PyDoc_STR("string helper module"),
14798 0,
14799 _string_methods,
14800 NULL,
14801 NULL,
14802 NULL,
14803 NULL
14804};
14805
14806PyMODINIT_FUNC
14807PyInit__string(void)
14808{
14809 return PyModule_Create(&_string_module);
14810}
14811
14812
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014813#ifdef __cplusplus
14814}
14815#endif