blob: f13a1de563aa2b82a1c70597ae4d7f48425b89b7 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used by the accessor macros in this file.  Debug builds run
   the structural consistency checker (without scanning the characters);
   other builds only test the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Raw field accessors for the string structs.  The names starting with an
   underscore read the struct member directly and can be used as lvalues;
   the others assert that the object is a valid, ready string first. */

/* utf8 member of the compact struct (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer: for compact ASCII strings this is the memory placed
   directly after the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the compact struct (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length: for compact ASCII strings this is the string length,
   otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of state.kind. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of the length member. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the non-compact struct (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for the public PyUnicode_READY macro: asserts the
   object with _PyUnicode_CHECK, evaluates to 0 when it is already ready and
   to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* true if the UTF-8 pointer of the (non compact-ASCII) string aliases its
   character data */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the string aliases its character data.
   The macro must only refer to its own argument: it previously read
   "unicode", silently testing whatever variable of that name was in scope
   at the call site instead of `op`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast; the main loop handles
   four characters per iteration and a tail loop copies the remaining
   0-3.  Every argument is evaluated exactly once.  `to` is parenthesized
   before the cast so that an expression such as `buf + offset` is cast as
   a whole rather than having the cast bind to `buf` alone. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: indexed by code point 0..127, an entry is 1 when
   the character is a line break, else 0. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-only sanity checker for the internal layout of a string object.

   Asserts that the state flags (kind, compact, ascii, ready, interned) and
   the data, utf8 and wstr pointers and lengths agree with each other for
   the object's layout.  When check_content is non-zero and the string is
   not in the legacy wchar_t form, the characters are scanned as well to
   verify that the narrowest kind able to hold the largest character is
   in use.

   Every violation trips an assert; when the function returns at all it
   returns 1, so it can be used inside assert() itself. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready yet */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the character data exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (i = 0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
#ifdef HAVE_MBCS
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
510/* stuff to implement simple "bloom filters" for Unicode characters.
511 to keep things simple, we use a single bitmask, using the least 5
512 bits from each unicode characters as the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
/* Integer type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit selected by ch in mask (mask must be an lvalue);
   BLOOM evaluates to non-zero when that bit is set.  A zero result means
   ch was never added; a non-zero result only means it may have been.
   The mask argument is parenthesized so that an expression argument such
   as `a | b` is combined as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128, otherwise the bloom mask is
   used to skip the Py_UNICODE_ISLINEBREAK() call for most characters. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* true if chr occurs in str: the bloom mask is tested first, so the
   PyUnicode_FindChar() search only runs when the mask says chr may be
   present. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one more character than requested to make sure the string
   is U+0000-terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New() each time it goes
   on to allocate a new object. */
static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() each time it is
   called and does not return the shared empty string. */
static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
#if SIZEOF_WCHAR_T == 2
/* Helper to convert a 16-bit wchar_t sequence [begin, end) into the UCS4
   buffer of `unicode`.  A high surrogate immediately followed by a low
   surrogate is joined into one code point; every other unit (including a
   lone surrogate) is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* not a complete surrogate pair: copy the unit as is */
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
Benjamin Petersonbac79492012-01-14 13:34:47 -05001266 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001268 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
/* Debug-build statistic: incremented on every call to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001508#ifdef Py_DEBUG
1509static int
1510unicode_is_singleton(PyObject *unicode)
1511{
1512 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1513 if (unicode == unicode_empty)
1514 return 1;
1515 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1516 {
1517 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1518 if (ch < 256 && unicode_latin1[ch] == unicode)
1519 return 1;
1520 }
1521 return 0;
1522}
1523#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnerd446d8e2012-02-05 01:45:45 +01001747 id->object = unicode_fromascii((unsigned char*)id->string,
1748 strlen(id->string));
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001749 if (!id->object)
1750 return NULL;
1751 PyUnicode_InternInPlace(&id->object);
1752 assert(!id->next);
1753 id->next = static_strings;
1754 static_strings = id;
1755 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001756 return id->object;
1757}
1758
1759void
1760_PyUnicode_ClearStaticStrings()
1761{
1762 _Py_Identifier *i;
1763 for (i = static_strings; i; i = i->next) {
1764 Py_DECREF(i->object);
1765 i->object = NULL;
1766 i->next = NULL;
1767 }
1768}
1769
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001770/* Internal function, don't check maximum character */
1771
Victor Stinnere57b1c02011-09-28 22:20:48 +02001772static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001773unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001774{
Victor Stinner785938e2011-12-11 20:09:03 +01001775 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001776 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001777#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001778 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001779#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001780 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001781 }
Victor Stinner785938e2011-12-11 20:09:03 +01001782 unicode = PyUnicode_New(size, 127);
1783 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001784 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001785 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1786 assert(_PyUnicode_CheckConsistency(unicode, 1));
1787 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001788}
1789
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001790static Py_UCS4
1791kind_maxchar_limit(unsigned int kind)
1792{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001793 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001794 case PyUnicode_1BYTE_KIND:
1795 return 0x80;
1796 case PyUnicode_2BYTE_KIND:
1797 return 0x100;
1798 case PyUnicode_4BYTE_KIND:
1799 return 0x10000;
1800 default:
1801 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001802 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001803 }
1804}
1805
Victor Stinner702c7342011-10-05 13:50:52 +02001806static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001807_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001810 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001811
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001812 if (size == 0) {
1813 Py_INCREF(unicode_empty);
1814 return unicode_empty;
1815 }
1816 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001817 if (size == 1)
1818 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001819
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001820 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001821 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 if (!res)
1823 return NULL;
1824 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001825 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001827}
1828
Victor Stinnere57b1c02011-09-28 22:20:48 +02001829static PyObject*
1830_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831{
1832 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001833 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001834
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001835 if (size == 0) {
1836 Py_INCREF(unicode_empty);
1837 return unicode_empty;
1838 }
1839 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001840 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001841 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001842
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001843 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001844 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 if (!res)
1846 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001847 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001849 else {
1850 _PyUnicode_CONVERT_BYTES(
1851 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1852 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001853 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 return res;
1855}
1856
Victor Stinnere57b1c02011-09-28 22:20:48 +02001857static PyObject*
1858_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859{
1860 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001861 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001862
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001863 if (size == 0) {
1864 Py_INCREF(unicode_empty);
1865 return unicode_empty;
1866 }
1867 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001868 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001869 return get_latin1_char((unsigned char)u[0]);
1870
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001871 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001872 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 if (!res)
1874 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001875 if (max_char < 256)
1876 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1877 PyUnicode_1BYTE_DATA(res));
1878 else if (max_char < 0x10000)
1879 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1880 PyUnicode_2BYTE_DATA(res));
1881 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001883 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 return res;
1885}
1886
1887PyObject*
1888PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1889{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001890 if (size < 0) {
1891 PyErr_SetString(PyExc_ValueError, "size must be positive");
1892 return NULL;
1893 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001894 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001896 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001898 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001900 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001901 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 PyErr_SetString(PyExc_SystemError, "invalid kind");
1903 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905}
1906
Victor Stinner25a4b292011-10-06 12:31:55 +02001907/* Ensure that a string uses the most efficient storage, if it is not the
1908 case: create a new string with of the right kind. Write NULL into *p_unicode
1909 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001910static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001911unicode_adjust_maxchar(PyObject **p_unicode)
1912{
1913 PyObject *unicode, *copy;
1914 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001915 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001916 unsigned int kind;
1917
1918 assert(p_unicode != NULL);
1919 unicode = *p_unicode;
1920 assert(PyUnicode_IS_READY(unicode));
1921 if (PyUnicode_IS_ASCII(unicode))
1922 return;
1923
1924 len = PyUnicode_GET_LENGTH(unicode);
1925 kind = PyUnicode_KIND(unicode);
1926 if (kind == PyUnicode_1BYTE_KIND) {
1927 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001928 max_char = ucs1lib_find_max_char(u, u + len);
1929 if (max_char >= 128)
1930 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001931 }
1932 else if (kind == PyUnicode_2BYTE_KIND) {
1933 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 max_char = ucs2lib_find_max_char(u, u + len);
1935 if (max_char >= 256)
1936 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001937 }
1938 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001939 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001940 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001941 max_char = ucs4lib_find_max_char(u, u + len);
1942 if (max_char >= 0x10000)
1943 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001944 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 copy = PyUnicode_New(len, max_char);
1946 copy_characters(copy, 0, unicode, 0, len);
1947 Py_DECREF(unicode);
1948 *p_unicode = copy;
1949}
1950
Victor Stinner034f6cf2011-09-30 02:26:44 +02001951PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001952_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001953{
Victor Stinner87af4f22011-11-21 23:03:47 +01001954 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001955 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956
Victor Stinner034f6cf2011-09-30 02:26:44 +02001957 if (!PyUnicode_Check(unicode)) {
1958 PyErr_BadInternalCall();
1959 return NULL;
1960 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001961 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001962 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001963
Victor Stinner87af4f22011-11-21 23:03:47 +01001964 length = PyUnicode_GET_LENGTH(unicode);
1965 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001966 if (!copy)
1967 return NULL;
1968 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1969
Victor Stinner87af4f22011-11-21 23:03:47 +01001970 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1971 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001973 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001974}
1975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976
Victor Stinnerbc603d12011-10-02 01:00:40 +02001977/* Widen Unicode objects to larger buffers. Don't write terminating null
1978 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979
1980void*
1981_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1982{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001983 Py_ssize_t len;
1984 void *result;
1985 unsigned int skind;
1986
Benjamin Petersonbac79492012-01-14 13:34:47 -05001987 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001988 return NULL;
1989
1990 len = PyUnicode_GET_LENGTH(s);
1991 skind = PyUnicode_KIND(s);
1992 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001993 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return NULL;
1995 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001996 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001997 case PyUnicode_2BYTE_KIND:
1998 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1999 if (!result)
2000 return PyErr_NoMemory();
2001 assert(skind == PyUnicode_1BYTE_KIND);
2002 _PyUnicode_CONVERT_BYTES(
2003 Py_UCS1, Py_UCS2,
2004 PyUnicode_1BYTE_DATA(s),
2005 PyUnicode_1BYTE_DATA(s) + len,
2006 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002008 case PyUnicode_4BYTE_KIND:
2009 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2010 if (!result)
2011 return PyErr_NoMemory();
2012 if (skind == PyUnicode_2BYTE_KIND) {
2013 _PyUnicode_CONVERT_BYTES(
2014 Py_UCS2, Py_UCS4,
2015 PyUnicode_2BYTE_DATA(s),
2016 PyUnicode_2BYTE_DATA(s) + len,
2017 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002019 else {
2020 assert(skind == PyUnicode_1BYTE_KIND);
2021 _PyUnicode_CONVERT_BYTES(
2022 Py_UCS1, Py_UCS4,
2023 PyUnicode_1BYTE_DATA(s),
2024 PyUnicode_1BYTE_DATA(s) + len,
2025 result);
2026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002028 default:
2029 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 }
Victor Stinner01698042011-10-04 00:04:26 +02002031 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 return NULL;
2033}
2034
2035static Py_UCS4*
2036as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2037 int copy_null)
2038{
2039 int kind;
2040 void *data;
2041 Py_ssize_t len, targetlen;
2042 if (PyUnicode_READY(string) == -1)
2043 return NULL;
2044 kind = PyUnicode_KIND(string);
2045 data = PyUnicode_DATA(string);
2046 len = PyUnicode_GET_LENGTH(string);
2047 targetlen = len;
2048 if (copy_null)
2049 targetlen++;
2050 if (!target) {
2051 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2052 PyErr_NoMemory();
2053 return NULL;
2054 }
2055 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2056 if (!target) {
2057 PyErr_NoMemory();
2058 return NULL;
2059 }
2060 }
2061 else {
2062 if (targetsize < targetlen) {
2063 PyErr_Format(PyExc_SystemError,
2064 "string is longer than the buffer");
2065 if (copy_null && 0 < targetsize)
2066 target[0] = 0;
2067 return NULL;
2068 }
2069 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002070 if (kind == PyUnicode_1BYTE_KIND) {
2071 Py_UCS1 *start = (Py_UCS1 *) data;
2072 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002073 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002074 else if (kind == PyUnicode_2BYTE_KIND) {
2075 Py_UCS2 *start = (Py_UCS2 *) data;
2076 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2077 }
2078 else {
2079 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 if (copy_null)
2083 target[len] = 0;
2084 return target;
2085}
2086
2087Py_UCS4*
2088PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2089 int copy_null)
2090{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002091 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 PyErr_BadInternalCall();
2093 return NULL;
2094 }
2095 return as_ucs4(string, target, targetsize, copy_null);
2096}
2097
2098Py_UCS4*
2099PyUnicode_AsUCS4Copy(PyObject *string)
2100{
2101 return as_ucs4(string, NULL, 0, 1);
2102}
2103
2104#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002105
Alexander Belopolsky40018472011-02-26 01:02:56 +00002106PyObject *
2107PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002110 if (size == 0) {
2111 Py_INCREF(unicode_empty);
2112 return unicode_empty;
2113 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 PyErr_BadInternalCall();
2115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
2117
Martin v. Löwis790465f2008-04-05 20:41:37 +00002118 if (size == -1) {
2119 size = wcslen(w);
2120 }
2121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123}
2124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002126
Walter Dörwald346737f2007-05-31 10:44:43 +00002127static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002128makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2129 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002130{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002131 *fmt++ = '%';
2132 if (width) {
2133 if (zeropad)
2134 *fmt++ = '0';
2135 fmt += sprintf(fmt, "%d", width);
2136 }
2137 if (precision)
2138 fmt += sprintf(fmt, ".%d", precision);
2139 if (longflag)
2140 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002141 else if (longlongflag) {
2142 /* longlongflag should only ever be nonzero on machines with
2143 HAVE_LONG_LONG defined */
2144#ifdef HAVE_LONG_LONG
2145 char *f = PY_FORMAT_LONG_LONG;
2146 while (*f)
2147 *fmt++ = *f++;
2148#else
2149 /* we shouldn't ever get here */
2150 assert(0);
2151 *fmt++ = 'l';
2152#endif
2153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 else if (size_tflag) {
2155 char *f = PY_FORMAT_SIZE_T;
2156 while (*f)
2157 *fmt++ = *f++;
2158 }
2159 *fmt++ = c;
2160 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002161}
2162
/* helpers for PyUnicode_FromFormatV() */

/* True when `ch` is one of the integer conversions that accept a length
   modifier: d, u or i. */
static int
format_is_dui(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* Read a run of decimal digits starting at *pf, advance *pf past it and
   return the value (0 when there is no digit). */
static int
format_parse_number(const char **pf)
{
    const char *s = *pf;
    int value = 0;

    while (Py_ISDIGIT((unsigned)*s)) {
        value = (value*10) + *s - '0';
        s++;
    }
    *pf = s;
    return value;
}

/* Parse the part of a format specification that follows the '%' pointed to
   by `f`: an optional width, an optional ".precision" and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z"; each
   only recognised directly in front of d, u or i).

   Every p_* output pointer may be NULL, in which case that value is simply
   not reported.  The return value points at the conversion character, or
   one character before the end for the bogus cases handled below. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int w, prec = 0;
    int is_long = 0, is_longlong = 0, is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    w = format_parse_number(&f);
    if (*f == '.') {
        f++;
        prec = format_parse_number(&f);
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = w;
    if (p_precision != NULL)
        *p_precision = prec;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (format_is_dui(f[1])) {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_is_dui(f[2])) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && format_is_dui(f[1])) {
        is_size_t = 1;
        ++f;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2227
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum space reserved for each
   formatted number. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2235
Walter Dörwaldd2034312007-05-18 16:29:38 +00002236PyObject *
2237PyUnicode_FromFormatV(const char *format, va_list vargs)
2238{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 va_list count;
2240 Py_ssize_t callcount = 0;
2241 PyObject **callresults = NULL;
2242 PyObject **callresult = NULL;
2243 Py_ssize_t n = 0;
2244 int width = 0;
2245 int precision = 0;
2246 int zeropad;
2247 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002248 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002250 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2252 Py_UCS4 argmaxchar;
2253 Py_ssize_t numbersize = 0;
2254 char *numberresults = NULL;
2255 char *numberresult = NULL;
2256 Py_ssize_t i;
2257 int kind;
2258 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002259
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002260 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002261 /* step 1: count the number of %S/%R/%A/%s format specifications
2262 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2263 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002265 * also estimate a upper bound for all the number formats in the string,
2266 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002268 for (f = format; *f; f++) {
2269 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002270 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2272 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2273 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2274 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002277#ifdef HAVE_LONG_LONG
2278 if (longlongflag) {
2279 if (width < MAX_LONG_LONG_CHARS)
2280 width = MAX_LONG_LONG_CHARS;
2281 }
2282 else
2283#endif
2284 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2285 including sign. Decimal takes the most space. This
2286 isn't enough for octal. If a width is specified we
2287 need more (which we allocate later). */
2288 if (width < MAX_LONG_CHARS)
2289 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290
2291 /* account for the size + '\0' to separate numbers
2292 inside of the numberresults buffer */
2293 numbersize += (width + 1);
2294 }
2295 }
2296 else if ((unsigned char)*f > 127) {
2297 PyErr_Format(PyExc_ValueError,
2298 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2299 "string, got a non-ASCII byte: 0x%02x",
2300 (unsigned char)*f);
2301 return NULL;
2302 }
2303 }
2304 /* step 2: allocate memory for the results of
2305 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2306 if (callcount) {
2307 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2308 if (!callresults) {
2309 PyErr_NoMemory();
2310 return NULL;
2311 }
2312 callresult = callresults;
2313 }
2314 /* step 2.5: allocate memory for the results of formating numbers */
2315 if (numbersize) {
2316 numberresults = PyObject_Malloc(numbersize);
2317 if (!numberresults) {
2318 PyErr_NoMemory();
2319 goto fail;
2320 }
2321 numberresult = numberresults;
2322 }
2323
2324 /* step 3: format numbers and figure out how large a buffer we need */
2325 for (f = format; *f; f++) {
2326 if (*f == '%') {
2327 const char* p;
2328 int longflag;
2329 int longlongflag;
2330 int size_tflag;
2331 int numprinted;
2332
2333 p = f;
2334 zeropad = (f[1] == '0');
2335 f = parse_format_flags(f, &width, &precision,
2336 &longflag, &longlongflag, &size_tflag);
2337 switch (*f) {
2338 case 'c':
2339 {
2340 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002341 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 n++;
2343 break;
2344 }
2345 case '%':
2346 n++;
2347 break;
2348 case 'i':
2349 case 'd':
2350 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2351 width, precision, *f);
2352 if (longflag)
2353 numprinted = sprintf(numberresult, fmt,
2354 va_arg(count, long));
2355#ifdef HAVE_LONG_LONG
2356 else if (longlongflag)
2357 numprinted = sprintf(numberresult, fmt,
2358 va_arg(count, PY_LONG_LONG));
2359#endif
2360 else if (size_tflag)
2361 numprinted = sprintf(numberresult, fmt,
2362 va_arg(count, Py_ssize_t));
2363 else
2364 numprinted = sprintf(numberresult, fmt,
2365 va_arg(count, int));
2366 n += numprinted;
2367 /* advance by +1 to skip over the '\0' */
2368 numberresult += (numprinted + 1);
2369 assert(*(numberresult - 1) == '\0');
2370 assert(*(numberresult - 2) != '\0');
2371 assert(numprinted >= 0);
2372 assert(numberresult <= numberresults + numbersize);
2373 break;
2374 case 'u':
2375 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2376 width, precision, 'u');
2377 if (longflag)
2378 numprinted = sprintf(numberresult, fmt,
2379 va_arg(count, unsigned long));
2380#ifdef HAVE_LONG_LONG
2381 else if (longlongflag)
2382 numprinted = sprintf(numberresult, fmt,
2383 va_arg(count, unsigned PY_LONG_LONG));
2384#endif
2385 else if (size_tflag)
2386 numprinted = sprintf(numberresult, fmt,
2387 va_arg(count, size_t));
2388 else
2389 numprinted = sprintf(numberresult, fmt,
2390 va_arg(count, unsigned int));
2391 n += numprinted;
2392 numberresult += (numprinted + 1);
2393 assert(*(numberresult - 1) == '\0');
2394 assert(*(numberresult - 2) != '\0');
2395 assert(numprinted >= 0);
2396 assert(numberresult <= numberresults + numbersize);
2397 break;
2398 case 'x':
2399 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2400 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2401 n += numprinted;
2402 numberresult += (numprinted + 1);
2403 assert(*(numberresult - 1) == '\0');
2404 assert(*(numberresult - 2) != '\0');
2405 assert(numprinted >= 0);
2406 assert(numberresult <= numberresults + numbersize);
2407 break;
2408 case 'p':
2409 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2410 /* %p is ill-defined: ensure leading 0x. */
2411 if (numberresult[1] == 'X')
2412 numberresult[1] = 'x';
2413 else if (numberresult[1] != 'x') {
2414 memmove(numberresult + 2, numberresult,
2415 strlen(numberresult) + 1);
2416 numberresult[0] = '0';
2417 numberresult[1] = 'x';
2418 numprinted += 2;
2419 }
2420 n += numprinted;
2421 numberresult += (numprinted + 1);
2422 assert(*(numberresult - 1) == '\0');
2423 assert(*(numberresult - 2) != '\0');
2424 assert(numprinted >= 0);
2425 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 break;
2427 case 's':
2428 {
2429 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002430 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002431 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002432 if (!str)
2433 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 /* since PyUnicode_DecodeUTF8 returns already flexible
2435 unicode objects, there is no need to call ready on them */
2436 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002437 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002439 /* Remember the str and switch to the next slot */
2440 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002441 break;
2442 }
2443 case 'U':
2444 {
2445 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002446 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 if (PyUnicode_READY(obj) == -1)
2448 goto fail;
2449 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002450 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002452 break;
2453 }
2454 case 'V':
2455 {
2456 PyObject *obj = va_arg(count, PyObject *);
2457 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002458 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002460 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002461 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 if (PyUnicode_READY(obj) == -1)
2463 goto fail;
2464 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002465 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002467 *callresult++ = NULL;
2468 }
2469 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002470 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002471 if (!str_obj)
2472 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002473 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002474 Py_DECREF(str_obj);
2475 goto fail;
2476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002478 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002480 *callresult++ = str_obj;
2481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002482 break;
2483 }
2484 case 'S':
2485 {
2486 PyObject *obj = va_arg(count, PyObject *);
2487 PyObject *str;
2488 assert(obj);
2489 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002490 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002492 if (PyUnicode_READY(str) == -1) {
2493 Py_DECREF(str);
2494 goto fail;
2495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002497 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 /* Remember the str and switch to the next slot */
2500 *callresult++ = str;
2501 break;
2502 }
2503 case 'R':
2504 {
2505 PyObject *obj = va_arg(count, PyObject *);
2506 PyObject *repr;
2507 assert(obj);
2508 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002509 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002511 if (PyUnicode_READY(repr) == -1) {
2512 Py_DECREF(repr);
2513 goto fail;
2514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002516 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 /* Remember the repr and switch to the next slot */
2519 *callresult++ = repr;
2520 break;
2521 }
2522 case 'A':
2523 {
2524 PyObject *obj = va_arg(count, PyObject *);
2525 PyObject *ascii;
2526 assert(obj);
2527 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002528 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002529 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002530 if (PyUnicode_READY(ascii) == -1) {
2531 Py_DECREF(ascii);
2532 goto fail;
2533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002535 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 /* Remember the repr and switch to the next slot */
2538 *callresult++ = ascii;
2539 break;
2540 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002541 default:
2542 /* if we stumble upon an unknown
2543 formatting code, copy the rest of
2544 the format string to the output
2545 string. (we cannot just skip the
2546 code, since there's no way to know
2547 what's in the argument list) */
2548 n += strlen(p);
2549 goto expand;
2550 }
2551 } else
2552 n++;
2553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 we don't have to resize the string.
2558 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002559 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 if (!string)
2561 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 kind = PyUnicode_KIND(string);
2563 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002564 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002569 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002570
2571 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2573 /* checking for == because the last argument could be a empty
2574 string, which causes i to point to end, the assert at the end of
2575 the loop */
2576 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002577
Benjamin Peterson14339b62009-01-31 16:36:08 +00002578 switch (*f) {
2579 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002580 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 const int ordinal = va_arg(vargs, int);
2582 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002584 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002585 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 case 'p':
2590 /* unused, since we already have the result */
2591 if (*f == 'p')
2592 (void) va_arg(vargs, void *);
2593 else
2594 (void) va_arg(vargs, int);
2595 /* extract the result from numberresults and append. */
2596 for (; *numberresult; ++i, ++numberresult)
2597 PyUnicode_WRITE(kind, data, i, *numberresult);
2598 /* skip over the separating '\0' */
2599 assert(*numberresult == '\0');
2600 numberresult++;
2601 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 break;
2603 case 's':
2604 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002605 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002607 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 size = PyUnicode_GET_LENGTH(*callresult);
2609 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002610 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002612 /* We're done with the unicode()/repr() => forget it */
2613 Py_DECREF(*callresult);
2614 /* switch to next unicode()/repr() result */
2615 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 break;
2617 }
2618 case 'U':
2619 {
2620 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 Py_ssize_t size;
2622 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2623 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002624 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 break;
2627 }
2628 case 'V':
2629 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002632 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 size = PyUnicode_GET_LENGTH(obj);
2635 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002636 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 size = PyUnicode_GET_LENGTH(*callresult);
2640 assert(PyUnicode_KIND(*callresult) <=
2641 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002643 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002644 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002646 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 break;
2648 }
2649 case 'S':
2650 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002651 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002653 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 /* unused, since we already have the result */
2655 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002657 copy_characters(string, i, *callresult, 0, size);
2658 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* We're done with the unicode()/repr() => forget it */
2660 Py_DECREF(*callresult);
2661 /* switch to next unicode()/repr() result */
2662 ++callresult;
2663 break;
2664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 break;
2668 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 for (; *p; ++p, ++i)
2670 PyUnicode_WRITE(kind, data, i, *p);
2671 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 goto end;
2673 }
Victor Stinner1205f272010-09-11 00:54:47 +00002674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 else {
2676 assert(i < PyUnicode_GET_LENGTH(string));
2677 PyUnicode_WRITE(kind, data, i++, *f);
2678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 if (callresults)
2684 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (numberresults)
2686 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002687 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 if (callresults) {
2690 PyObject **callresult2 = callresults;
2691 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002692 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 ++callresult2;
2694 }
2695 PyObject_Free(callresults);
2696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 if (numberresults)
2698 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002700}
2701
Walter Dörwaldd2034312007-05-18 16:29:38 +00002702PyObject *
2703PyUnicode_FromFormat(const char *format, ...)
2704{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 PyObject* ret;
2706 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707
2708#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 ret = PyUnicode_FromFormatV(format, vargs);
2714 va_end(vargs);
2715 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716}
2717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718#ifdef HAVE_WCHAR_H
2719
Victor Stinner5593d8a2010-10-02 11:11:27 +00002720/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2721 convert a Unicode object to a wide character string.
2722
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 character) required to convert the unicode object. Ignore size argument.
2725
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002730unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002731 wchar_t *w,
2732 Py_ssize_t size)
2733{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 const wchar_t *wstr;
2736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 if (wstr == NULL)
2739 return -1;
2740
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (size > res)
2743 size = res + 1;
2744 else
2745 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 return res;
2748 }
2749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002751}
2752
2753Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002754PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002755 wchar_t *w,
2756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
2758 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 PyErr_BadInternalCall();
2760 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002762 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763}
2764
Victor Stinner137c34c2010-09-29 10:25:54 +00002765wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002766PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002767 Py_ssize_t *size)
2768{
2769 wchar_t* buffer;
2770 Py_ssize_t buflen;
2771
2772 if (unicode == NULL) {
2773 PyErr_BadInternalCall();
2774 return NULL;
2775 }
2776
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 if (buflen == -1)
2779 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002781 PyErr_NoMemory();
2782 return NULL;
2783 }
2784
Victor Stinner137c34c2010-09-29 10:25:54 +00002785 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2786 if (buffer == NULL) {
2787 PyErr_NoMemory();
2788 return NULL;
2789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002790 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 if (buflen == -1)
2792 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793 if (size != NULL)
2794 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 return buffer;
2796}
2797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002804 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 PyErr_SetString(PyExc_ValueError,
2806 "chr() arg not in range(0x110000)");
2807 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 if (ordinal < 256)
2811 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 v = PyUnicode_New(1, ordinal);
2814 if (v == NULL)
2815 return NULL;
2816 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002817 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002819}
2820
Alexander Belopolsky40018472011-02-26 01:02:56 +00002821PyObject *
2822PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002824 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002826 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002827 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002828 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 Py_INCREF(obj);
2830 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 }
2832 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 /* For a Unicode subtype that's not a Unicode object,
2834 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002835 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002836 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002837 PyErr_Format(PyExc_TypeError,
2838 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002839 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002841}
2842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843PyObject *
2844PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002845 const char *encoding,
2846 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002847{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002848 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002849 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 PyErr_BadInternalCall();
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002855
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002856 /* Decoding bytes objects is the most common case and should be fast */
2857 if (PyBytes_Check(obj)) {
2858 if (PyBytes_GET_SIZE(obj) == 0) {
2859 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002860 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002861 }
2862 else {
2863 v = PyUnicode_Decode(
2864 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2865 encoding, errors);
2866 }
2867 return v;
2868 }
2869
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002870 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 PyErr_SetString(PyExc_TypeError,
2872 "decoding str is not supported");
2873 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002876 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2877 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2878 PyErr_Format(PyExc_TypeError,
2879 "coercing to str: need bytes, bytearray "
2880 "or buffer-like object, %.80s found",
2881 Py_TYPE(obj)->tp_name);
2882 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002883 }
Tim Petersced69f82003-09-16 20:30:58 +00002884
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002885 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002887 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 }
Tim Petersced69f82003-09-16 20:30:58 +00002889 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002891
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002892 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002893 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894}
2895
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  NUL-terminated encoding name, or NULL (treated as "utf-8").
   lower:     output buffer receiving the normalized, NUL-terminated name.
   lower_len: size of `lower` in bytes, including room for the terminator.

   Return 1 on success, 0 on error (the normalized name does not fit into
   lower_len-1 characters, or the buffer has no room at all).  On error the
   content of `lower` is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without this guard `lower_len - 1` below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Only copy the default name when it fits, terminator included. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot; reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2932
Alexander Belopolsky40018472011-02-26 01:02:56 +00002933PyObject *
2934PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002935 Py_ssize_t size,
2936 const char *encoding,
2937 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002938{
2939 PyObject *buffer = NULL, *unicode;
2940 Py_buffer info;
2941 char lower[11]; /* Enough for any encoding shortcut */
2942
Fred Drakee4315f52000-05-09 19:53:39 +00002943 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002944 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002945 if ((strcmp(lower, "utf-8") == 0) ||
2946 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002947 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002948 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002949 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002950 (strcmp(lower, "iso-8859-1") == 0))
2951 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002952#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002953 else if (strcmp(lower, "mbcs") == 0)
2954 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002955#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "ascii") == 0)
2957 return PyUnicode_DecodeASCII(s, size, errors);
2958 else if (strcmp(lower, "utf-16") == 0)
2959 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2960 else if (strcmp(lower, "utf-32") == 0)
2961 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963
2964 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002965 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002966 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002967 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002968 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 if (buffer == NULL)
2970 goto onError;
2971 unicode = PyCodec_Decode(buffer, encoding, errors);
2972 if (unicode == NULL)
2973 goto onError;
2974 if (!PyUnicode_Check(unicode)) {
2975 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002976 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002977 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 Py_DECREF(unicode);
2979 goto onError;
2980 }
2981 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002982 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002983
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 Py_XDECREF(buffer);
2986 return NULL;
2987}
2988
Alexander Belopolsky40018472011-02-26 01:02:56 +00002989PyObject *
2990PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002991 const char *encoding,
2992 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002993{
2994 PyObject *v;
2995
2996 if (!PyUnicode_Check(unicode)) {
2997 PyErr_BadArgument();
2998 goto onError;
2999 }
3000
3001 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003003
3004 /* Decode via the codec registry */
3005 v = PyCodec_Decode(unicode, encoding, errors);
3006 if (v == NULL)
3007 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003011 return NULL;
3012}
3013
Alexander Belopolsky40018472011-02-26 01:02:56 +00003014PyObject *
3015PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003016 const char *encoding,
3017 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003018{
3019 PyObject *v;
3020
3021 if (!PyUnicode_Check(unicode)) {
3022 PyErr_BadArgument();
3023 goto onError;
3024 }
3025
3026 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003028
3029 /* Decode via the codec registry */
3030 v = PyCodec_Decode(unicode, encoding, errors);
3031 if (v == NULL)
3032 goto onError;
3033 if (!PyUnicode_Check(v)) {
3034 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003035 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003036 Py_TYPE(v)->tp_name);
3037 Py_DECREF(v);
3038 goto onError;
3039 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003040 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003043 return NULL;
3044}
3045
Alexander Belopolsky40018472011-02-26 01:02:56 +00003046PyObject *
3047PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003048 Py_ssize_t size,
3049 const char *encoding,
3050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051{
3052 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 unicode = PyUnicode_FromUnicode(s, size);
3055 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3058 Py_DECREF(unicode);
3059 return v;
3060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
3063PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 const char *encoding,
3065 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003066{
3067 PyObject *v;
3068
3069 if (!PyUnicode_Check(unicode)) {
3070 PyErr_BadArgument();
3071 goto onError;
3072 }
3073
3074 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
3077 /* Encode via the codec registry */
3078 v = PyCodec_Encode(unicode, encoding, errors);
3079 if (v == NULL)
3080 goto onError;
3081 return v;
3082
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003084 return NULL;
3085}
3086
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Returns the index, in wchar_t
   units, of the first unit that fails; returns 0 when no failing
   character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Scratch string: one character (or surrogate pair) plus the
       terminating NUL. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3133
Victor Stinner1b579672011-12-17 05:47:23 +01003134static int
3135locale_error_handler(const char *errors, int *surrogateescape)
3136{
3137 if (errors == NULL) {
3138 *surrogateescape = 0;
3139 return 0;
3140 }
3141
3142 if (strcmp(errors, "strict") == 0) {
3143 *surrogateescape = 0;
3144 return 0;
3145 }
3146 if (strcmp(errors, "surrogateescape") == 0) {
3147 *surrogateescape = 1;
3148 return 0;
3149 }
3150 PyErr_Format(PyExc_ValueError,
3151 "only 'strict' and 'surrogateescape' error handlers "
3152 "are supported, not '%s'",
3153 errors);
3154 return -1;
3155}
3156
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003157PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003158PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003159{
3160 Py_ssize_t wlen, wlen2;
3161 wchar_t *wstr;
3162 PyObject *bytes = NULL;
3163 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003164 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003165 PyObject *exc;
3166 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003167 int surrogateescape;
3168
3169 if (locale_error_handler(errors, &surrogateescape) < 0)
3170 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003171
3172 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3173 if (wstr == NULL)
3174 return NULL;
3175
3176 wlen2 = wcslen(wstr);
3177 if (wlen2 != wlen) {
3178 PyMem_Free(wstr);
3179 PyErr_SetString(PyExc_TypeError, "embedded null character");
3180 return NULL;
3181 }
3182
3183 if (surrogateescape) {
3184 /* locale encoding with surrogateescape */
3185 char *str;
3186
3187 str = _Py_wchar2char(wstr, &error_pos);
3188 if (str == NULL) {
3189 if (error_pos == (size_t)-1) {
3190 PyErr_NoMemory();
3191 PyMem_Free(wstr);
3192 return NULL;
3193 }
3194 else {
3195 goto encode_error;
3196 }
3197 }
3198 PyMem_Free(wstr);
3199
3200 bytes = PyBytes_FromString(str);
3201 PyMem_Free(str);
3202 }
3203 else {
3204 size_t len, len2;
3205
3206 len = wcstombs(NULL, wstr, 0);
3207 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003208 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003209 goto encode_error;
3210 }
3211
3212 bytes = PyBytes_FromStringAndSize(NULL, len);
3213 if (bytes == NULL) {
3214 PyMem_Free(wstr);
3215 return NULL;
3216 }
3217
3218 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3219 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003220 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003221 goto encode_error;
3222 }
3223 PyMem_Free(wstr);
3224 }
3225 return bytes;
3226
3227encode_error:
3228 errmsg = strerror(errno);
3229 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003230
3231 if (error_pos == (size_t)-1)
3232 error_pos = wcstombs_errorpos(wstr);
3233
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 PyMem_Free(wstr);
3235 Py_XDECREF(bytes);
3236
Victor Stinner2f197072011-12-17 07:08:30 +01003237 if (errmsg != NULL) {
3238 size_t errlen;
3239 wstr = _Py_char2wchar(errmsg, &errlen);
3240 if (wstr != NULL) {
3241 reason = PyUnicode_FromWideChar(wstr, errlen);
3242 PyMem_Free(wstr);
3243 } else
3244 errmsg = NULL;
3245 }
3246 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003247 reason = PyUnicode_FromString(
3248 "wcstombs() encountered an unencodable "
3249 "wide character");
3250 if (reason == NULL)
3251 return NULL;
3252
3253 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3254 "locale", unicode,
3255 (Py_ssize_t)error_pos,
3256 (Py_ssize_t)(error_pos+1),
3257 reason);
3258 Py_DECREF(reason);
3259 if (exc != NULL) {
3260 PyCodec_StrictErrors(exc);
3261 Py_XDECREF(exc);
3262 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 return NULL;
3264}
3265
Victor Stinnerad158722010-10-27 00:25:46 +00003266PyObject *
3267PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003268{
Victor Stinner99b95382011-07-04 14:23:54 +02003269#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003270 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003271#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003272 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003273#else
Victor Stinner793b5312011-04-27 00:24:21 +02003274 PyInterpreterState *interp = PyThreadState_GET()->interp;
3275 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3276 cannot use it to encode and decode filenames before it is loaded. Load
3277 the Python codec requires to encode at least its own filename. Use the C
3278 version of the locale codec until the codec registry is initialized and
3279 the Python codec is loaded.
3280
3281 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3282 cannot only rely on it: check also interp->fscodec_initialized for
3283 subinterpreters. */
3284 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003285 return PyUnicode_AsEncodedString(unicode,
3286 Py_FileSystemDefaultEncoding,
3287 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003288 }
3289 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003290 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003291 }
Victor Stinnerad158722010-10-27 00:25:46 +00003292#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 const char *encoding,
3298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299{
3300 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003301 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 if (!PyUnicode_Check(unicode)) {
3304 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
Fred Drakee4315f52000-05-09 19:53:39 +00003307
Fred Drakee4315f52000-05-09 19:53:39 +00003308 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003309 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003310 if ((strcmp(lower, "utf-8") == 0) ||
3311 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003312 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003313 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003314 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003315 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003316 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003317 }
Victor Stinner37296e82010-06-10 13:36:23 +00003318 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003319 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003320 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003321 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003322#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003323 else if (strcmp(lower, "mbcs") == 0)
3324 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003325#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003326 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003327 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329
3330 /* Encode via the codec registry */
3331 v = PyCodec_Encode(unicode, encoding, errors);
3332 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003333 return NULL;
3334
3335 /* The normal path */
3336 if (PyBytes_Check(v))
3337 return v;
3338
3339 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003340 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003341 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003342 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003343
3344 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3345 "encoder %s returned bytearray instead of bytes",
3346 encoding);
3347 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003348 Py_DECREF(v);
3349 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003350 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003352 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3353 Py_DECREF(v);
3354 return b;
3355 }
3356
3357 PyErr_Format(PyExc_TypeError,
3358 "encoder did not return a bytes object (type=%.400s)",
3359 Py_TYPE(v)->tp_name);
3360 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003361 return NULL;
3362}
3363
Alexander Belopolsky40018472011-02-26 01:02:56 +00003364PyObject *
3365PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003366 const char *encoding,
3367 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003368{
3369 PyObject *v;
3370
3371 if (!PyUnicode_Check(unicode)) {
3372 PyErr_BadArgument();
3373 goto onError;
3374 }
3375
3376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378
3379 /* Encode via the codec registry */
3380 v = PyCodec_Encode(unicode, encoding, errors);
3381 if (v == NULL)
3382 goto onError;
3383 if (!PyUnicode_Check(v)) {
3384 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003385 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003386 Py_TYPE(v)->tp_name);
3387 Py_DECREF(v);
3388 goto onError;
3389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003391
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 return NULL;
3394}
3395
/* Locate the first byte sequence of `str` (at most `len` bytes) that
   cannot be decoded with mbrtowc().

   Returns the byte offset of the invalid or incomplete sequence, or 0
   when none is found.  Always returns 0 when HAVE_MBRTOWC is not
   defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3426
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003427PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003428PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003429 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003430{
3431 wchar_t smallbuf[256];
3432 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3433 wchar_t *wstr;
3434 size_t wlen, wlen2;
3435 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003436 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003437 size_t error_pos;
3438 char *errmsg;
3439 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003440
3441 if (locale_error_handler(errors, &surrogateescape) < 0)
3442 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003443
3444 if (str[len] != '\0' || len != strlen(str)) {
3445 PyErr_SetString(PyExc_TypeError, "embedded null character");
3446 return NULL;
3447 }
3448
3449 if (surrogateescape)
3450 {
3451 wstr = _Py_char2wchar(str, &wlen);
3452 if (wstr == NULL) {
3453 if (wlen == (size_t)-1)
3454 PyErr_NoMemory();
3455 else
3456 PyErr_SetFromErrno(PyExc_OSError);
3457 return NULL;
3458 }
3459
3460 unicode = PyUnicode_FromWideChar(wstr, wlen);
3461 PyMem_Free(wstr);
3462 }
3463 else {
3464#ifndef HAVE_BROKEN_MBSTOWCS
3465 wlen = mbstowcs(NULL, str, 0);
3466#else
3467 wlen = len;
3468#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003469 if (wlen == (size_t)-1)
3470 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003471 if (wlen+1 <= smallbuf_len) {
3472 wstr = smallbuf;
3473 }
3474 else {
3475 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3476 return PyErr_NoMemory();
3477
3478 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3479 if (!wstr)
3480 return PyErr_NoMemory();
3481 }
3482
3483 /* This shouldn't fail now */
3484 wlen2 = mbstowcs(wstr, str, wlen+1);
3485 if (wlen2 == (size_t)-1) {
3486 if (wstr != smallbuf)
3487 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003488 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003489 }
3490#ifdef HAVE_BROKEN_MBSTOWCS
3491 assert(wlen2 == wlen);
3492#endif
3493 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3494 if (wstr != smallbuf)
3495 PyMem_Free(wstr);
3496 }
3497 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003498
3499decode_error:
3500 errmsg = strerror(errno);
3501 assert(errmsg != NULL);
3502
3503 error_pos = mbstowcs_errorpos(str, len);
3504 if (errmsg != NULL) {
3505 size_t errlen;
3506 wstr = _Py_char2wchar(errmsg, &errlen);
3507 if (wstr != NULL) {
3508 reason = PyUnicode_FromWideChar(wstr, errlen);
3509 PyMem_Free(wstr);
3510 } else
3511 errmsg = NULL;
3512 }
3513 if (errmsg == NULL)
3514 reason = PyUnicode_FromString(
3515 "mbstowcs() encountered an invalid multibyte sequence");
3516 if (reason == NULL)
3517 return NULL;
3518
3519 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3520 "locale", str, len,
3521 (Py_ssize_t)error_pos,
3522 (Py_ssize_t)(error_pos+1),
3523 reason);
3524 Py_DECREF(reason);
3525 if (exc != NULL) {
3526 PyCodec_StrictErrors(exc);
3527 Py_XDECREF(exc);
3528 }
3529 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003530}
3531
3532PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003533PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003534{
3535 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003536 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537}
3538
3539
3540PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003541PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003542 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003543 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3544}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003545
Christian Heimes5894ba72007-11-04 11:43:14 +00003546PyObject*
3547PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3548{
Victor Stinner99b95382011-07-04 14:23:54 +02003549#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003550 return PyUnicode_DecodeMBCS(s, size, NULL);
3551#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003552 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003553#else
Victor Stinner793b5312011-04-27 00:24:21 +02003554 PyInterpreterState *interp = PyThreadState_GET()->interp;
3555 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3556 cannot use it to encode and decode filenames before it is loaded. Load
3557 the Python codec requires to encode at least its own filename. Use the C
3558 version of the locale codec until the codec registry is initialized and
3559 the Python codec is loaded.
3560
3561 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3562 cannot only rely on it: check also interp->fscodec_initialized for
3563 subinterpreters. */
3564 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003565 return PyUnicode_Decode(s, size,
3566 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003567 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003568 }
3569 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003570 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003571 }
Victor Stinnerad158722010-10-27 00:25:46 +00003572#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003573}
3574
Martin v. Löwis011e8422009-05-05 04:43:17 +00003575
3576int
Antoine Pitrou13348842012-01-29 18:36:34 +01003577_PyUnicode_HasNULChars(PyObject* s)
3578{
3579 static PyObject *nul = NULL;
3580
3581 if (nul == NULL)
3582 nul = PyUnicode_FromStringAndSize("\0", 1);
3583 if (nul == NULL)
3584 return -1;
3585 return PyUnicode_Contains(s, nul);
3586}
3587
3588
3589int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003590PyUnicode_FSConverter(PyObject* arg, void* addr)
3591{
3592 PyObject *output = NULL;
3593 Py_ssize_t size;
3594 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003595 if (arg == NULL) {
3596 Py_DECREF(*(PyObject**)addr);
3597 return 1;
3598 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003599 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003600 output = arg;
3601 Py_INCREF(output);
3602 }
3603 else {
3604 arg = PyUnicode_FromObject(arg);
3605 if (!arg)
3606 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003607 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003608 Py_DECREF(arg);
3609 if (!output)
3610 return 0;
3611 if (!PyBytes_Check(output)) {
3612 Py_DECREF(output);
3613 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3614 return 0;
3615 }
3616 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003617 size = PyBytes_GET_SIZE(output);
3618 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003619 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003620 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003621 Py_DECREF(output);
3622 return 0;
3623 }
3624 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003625 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003626}
3627
3628
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003629int
3630PyUnicode_FSDecoder(PyObject* arg, void* addr)
3631{
3632 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003633 if (arg == NULL) {
3634 Py_DECREF(*(PyObject**)addr);
3635 return 1;
3636 }
3637 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003638 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003639 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003640 output = arg;
3641 Py_INCREF(output);
3642 }
3643 else {
3644 arg = PyBytes_FromObject(arg);
3645 if (!arg)
3646 return 0;
3647 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3648 PyBytes_GET_SIZE(arg));
3649 Py_DECREF(arg);
3650 if (!output)
3651 return 0;
3652 if (!PyUnicode_Check(output)) {
3653 Py_DECREF(output);
3654 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3655 return 0;
3656 }
3657 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003658 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003659 Py_DECREF(output);
3660 return 0;
3661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003662 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003663 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003664 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3665 Py_DECREF(output);
3666 return 0;
3667 }
3668 *(PyObject**)addr = output;
3669 return Py_CLEANUP_SUPPORTED;
3670}
3671
3672
Martin v. Löwis5b222132007-06-10 09:51:05 +00003673char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003674PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003675{
Christian Heimesf3863112007-11-22 07:46:41 +00003676 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003678 if (!PyUnicode_Check(unicode)) {
3679 PyErr_BadArgument();
3680 return NULL;
3681 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003682 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003683 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003685 if (PyUnicode_UTF8(unicode) == NULL) {
3686 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3688 if (bytes == NULL)
3689 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003690 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3691 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003692 Py_DECREF(bytes);
3693 return NULL;
3694 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003695 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3696 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3697 PyBytes_AS_STRING(bytes),
3698 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003699 Py_DECREF(bytes);
3700 }
3701
3702 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003703 *psize = PyUnicode_UTF8_LENGTH(unicode);
3704 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003705}
3706
3707char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003708PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3711}
3712
/* Counter that exists only when Py_DEBUG is defined.
   PyUnicode_AsUnicodeAndSize() increments it each time it is entered for an
   object whose wstr pointer is still NULL. */
#ifdef Py_DEBUG
static int unicode_as_unicode_calls = 0;
#endif
3716
3717
/* Return the wchar_t (wstr) representation of unicode, building it and
   storing it on the object first if the wstr pointer is still NULL.

   unicode - must be a unicode object, otherwise PyErr_BadArgument() is
             raised and NULL is returned.
   size    - if non-NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   Returns NULL with MemoryError set when the wstr buffer cannot be
   allocated.

   NOTE(review): the allocation sizes below are computed without an explicit
   overflow check -- confirm the length limits enforced elsewhere make that
   safe. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the source pointer type that actually needs converting for this
       wchar_t width is declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF; each of them
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* length + extra surrogate units + 1 for the terminating 0 */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points into pairs. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-only sanity check of the length computed above. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte kind: one wchar_t per character plus the
               terminating 0. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before reporting the
                   fatal error. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3833
Alexander Belopolsky40018472011-02-26 01:02:56 +00003834Py_UNICODE *
3835PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838}
3839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840
Alexander Belopolsky40018472011-02-26 01:02:56 +00003841Py_ssize_t
3842PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843{
3844 if (!PyUnicode_Check(unicode)) {
3845 PyErr_BadArgument();
3846 goto onError;
3847 }
3848 return PyUnicode_GET_SIZE(unicode);
3849
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 return -1;
3852}
3853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854Py_ssize_t
3855PyUnicode_GetLength(PyObject *unicode)
3856{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003857 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858 PyErr_BadArgument();
3859 return -1;
3860 }
3861
3862 return PyUnicode_GET_LENGTH(unicode);
3863}
3864
3865Py_UCS4
3866PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3867{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003868 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3869 PyErr_BadArgument();
3870 return (Py_UCS4)-1;
3871 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003872 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003873 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 return (Py_UCS4)-1;
3875 }
3876 return PyUnicode_READ_CHAR(unicode, index);
3877}
3878
3879int
3880PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3881{
3882 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003883 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 return -1;
3885 }
Victor Stinner488fa492011-12-12 00:01:39 +01003886 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003887 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003888 PyErr_SetString(PyExc_IndexError, "string index out of range");
3889 return -1;
3890 }
Victor Stinner488fa492011-12-12 00:01:39 +01003891 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003892 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3894 index, ch);
3895 return 0;
3896}
3897
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; callers must not modify or free
   it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3903
Victor Stinner554f3f02010-06-16 23:33:54 +00003904/* create or adjust a UnicodeDecodeError */
3905static void
3906make_decode_exception(PyObject **exceptionObject,
3907 const char *encoding,
3908 const char *input, Py_ssize_t length,
3909 Py_ssize_t startpos, Py_ssize_t endpos,
3910 const char *reason)
3911{
3912 if (*exceptionObject == NULL) {
3913 *exceptionObject = PyUnicodeDecodeError_Create(
3914 encoding, input, length, startpos, endpos, reason);
3915 }
3916 else {
3917 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3918 goto onError;
3919 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3920 goto onError;
3921 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3922 goto onError;
3923 }
3924 return;
3925
3926onError:
3927 Py_DECREF(*exceptionObject);
3928 *exceptionObject = NULL;
3929}
3930
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931/* error handling callback helper:
3932 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003933 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 and adjust various state variables.
3935 return 0 on success, -1 on error
3936*/
3937
Alexander Belopolsky40018472011-02-26 01:02:56 +00003938static int
3939unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003940 const char *encoding, const char *reason,
3941 const char **input, const char **inend, Py_ssize_t *startinpos,
3942 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003943 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003945 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946
3947 PyObject *restuple = NULL;
3948 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003949 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003950 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003951 Py_ssize_t requiredsize;
3952 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003953 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 int res = -1;
3955
Victor Stinner596a6c42011-11-09 00:02:18 +01003956 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3957 outsize = PyUnicode_GET_LENGTH(*output);
3958 else
3959 outsize = _PyUnicode_WSTR_LENGTH(*output);
3960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003962 *errorHandler = PyCodec_LookupError(errors);
3963 if (*errorHandler == NULL)
3964 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 }
3966
Victor Stinner554f3f02010-06-16 23:33:54 +00003967 make_decode_exception(exceptionObject,
3968 encoding,
3969 *input, *inend - *input,
3970 *startinpos, *endinpos,
3971 reason);
3972 if (*exceptionObject == NULL)
3973 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974
3975 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3976 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003979 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 }
3982 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003984 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003985 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003986
3987 /* Copy back the bytes variables, which might have been modified by the
3988 callback */
3989 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3990 if (!inputobj)
3991 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003992 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003994 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003995 *input = PyBytes_AS_STRING(inputobj);
3996 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003997 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003998 /* we can DECREF safely, as the exception has another reference,
3999 so the object won't go away. */
4000 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004001
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004004 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4006 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004007 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008
Victor Stinner596a6c42011-11-09 00:02:18 +01004009 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4010 /* need more space? (at least enough for what we
4011 have+the replacement+the rest of the string (starting
4012 at the new input position), so we won't have to check space
4013 when there are no errors in the rest of the string) */
4014 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4015 requiredsize = *outpos + replen + insize-newpos;
4016 if (requiredsize > outsize) {
4017 if (requiredsize<2*outsize)
4018 requiredsize = 2*outsize;
4019 if (unicode_resize(output, requiredsize) < 0)
4020 goto onError;
4021 }
4022 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004024 copy_characters(*output, *outpos, repunicode, 0, replen);
4025 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004027 else {
4028 wchar_t *repwstr;
4029 Py_ssize_t repwlen;
4030 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4031 if (repwstr == NULL)
4032 goto onError;
4033 /* need more space? (at least enough for what we
4034 have+the replacement+the rest of the string (starting
4035 at the new input position), so we won't have to check space
4036 when there are no errors in the rest of the string) */
4037 requiredsize = *outpos + repwlen + insize-newpos;
4038 if (requiredsize > outsize) {
4039 if (requiredsize < 2*outsize)
4040 requiredsize = 2*outsize;
4041 if (unicode_resize(output, requiredsize) < 0)
4042 goto onError;
4043 }
4044 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4045 *outpos += repwlen;
4046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004048 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 /* we made it! */
4051 res = 0;
4052
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 Py_XDECREF(restuple);
4055 return res;
4056}
4057
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound flag expressions (e.g. "a || b") keep their meaning.  Note
 * that c is evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004138
Alexander Belopolsky40018472011-02-26 01:02:56 +00004139PyObject *
4140PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004141 Py_ssize_t size,
4142 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004144 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4145}
4146
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - input bytes and their count
 * errors   - error handler name, passed on to
 *            unicode_decode_call_errorhandler()
 * consumed - if non-NULL, receives the number of input bytes consumed; an
 *            unfinished shift sequence at the end of the input is then
 *            backed off instead of being reported as an error
 *
 * Returns the decoded string, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+...' section */
    Py_ssize_t shiftOutStart;       /* output position where that section began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered through "goto restart" from the end-of-string
           handling below, when the error handler left input to process. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the pending
                               high surrogate on its own, then handle outCh
                               below */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The handler may replace starts/e/s and advances outpos past the
           replacement text it wrote. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* NOTE(review): inShift, surrogate and the bit buffer are not
               reset before re-entering the loop -- confirm intended. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the output to the number of characters actually written. */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4337
4338
Alexander Belopolsky40018472011-02-26 01:02:56 +00004339PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004340_PyUnicode_EncodeUTF7(PyObject *str,
4341 int base64SetO,
4342 int base64WhiteSpace,
4343 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004345 int kind;
4346 void *data;
4347 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004348 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004349 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004351 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 unsigned int base64bits = 0;
4353 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004354 char * out;
4355 char * start;
4356
Benjamin Petersonbac79492012-01-14 13:34:47 -05004357 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004358 return NULL;
4359 kind = PyUnicode_KIND(str);
4360 data = PyUnicode_DATA(str);
4361 len = PyUnicode_GET_LENGTH(str);
4362
4363 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004364 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004366 /* It might be possible to tighten this worst case */
4367 allocated = 8 * len;
4368 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004369 return PyErr_NoMemory();
4370
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 if (v == NULL)
4373 return NULL;
4374
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004375 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004376 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004377 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (inShift) {
4380 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4381 /* shifting out */
4382 if (base64bits) { /* output remaining bits */
4383 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4384 base64buffer = 0;
4385 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 }
4387 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 /* Characters not in the BASE64 set implicitly unshift the sequence
4389 so no '-' is required, except if the character is itself a '-' */
4390 if (IS_BASE64(ch) || ch == '-') {
4391 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 *out++ = (char) ch;
4394 }
4395 else {
4396 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004397 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 else { /* not in a shift sequence */
4400 if (ch == '+') {
4401 *out++ = '+';
4402 *out++ = '-';
4403 }
4404 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4405 *out++ = (char) ch;
4406 }
4407 else {
4408 *out++ = '+';
4409 inShift = 1;
4410 goto encode_char;
4411 }
4412 }
4413 continue;
4414encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004416 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004417
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 /* code first surrogate */
4419 base64bits += 16;
4420 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4421 while (base64bits >= 6) {
4422 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4423 base64bits -= 6;
4424 }
4425 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004426 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 base64bits += 16;
4429 base64buffer = (base64buffer << 16) | ch;
4430 while (base64bits >= 6) {
4431 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4432 base64bits -= 6;
4433 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 if (base64bits)
4436 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4437 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004439 if (_PyBytes_Resize(&v, out - start) < 0)
4440 return NULL;
4441 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004443PyObject *
4444PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4445 Py_ssize_t size,
4446 int base64SetO,
4447 int base64WhiteSpace,
4448 const char *errors)
4449{
4450 PyObject *result;
4451 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4452 if (tmp == NULL)
4453 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004454 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004455 base64WhiteSpace, errors);
4456 Py_DECREF(tmp);
4457 return result;
4458}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460#undef IS_BASE64
4461#undef FROM_BASE64
4462#undef TO_BASE64
4463#undef DECODE_DIRECT
4464#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466/* --- UTF-8 Codec -------------------------------------------------------- */
4467
/* Lookup table indexed by the first byte of a UTF-8 sequence.  It is only
   ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4489
Alexander Belopolsky40018472011-02-26 01:02:56 +00004490PyObject *
4491PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004492 Py_ssize_t size,
4493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494{
Walter Dörwald69652032004-09-07 20:24:22 +00004495 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4496}
4497
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004498#include "stringlib/ucs1lib.h"
4499#include "stringlib/codecs.h"
4500#include "stringlib/undef.h"
4501
4502#include "stringlib/ucs2lib.h"
4503#include "stringlib/codecs.h"
4504#include "stringlib/undef.h"
4505
4506#include "stringlib/ucs4lib.h"
4507#include "stringlib/codecs.h"
4508#include "stringlib/undef.h"
4509
Antoine Pitrouab868312009-01-10 15:40:25 +00004510/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4511#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4512
4513/* Mask to quickly check whether a C 'long' contains a
4514 non-ASCII, UTF8-encoded char. */
4515#if (SIZEOF_LONG == 8)
4516# define ASCII_CHAR_MASK 0x8080808080808080L
4517#elif (SIZEOF_LONG == 4)
4518# define ASCII_CHAR_MASK 0x80808080L
4519#else
4520# error C 'long' size should be either 4 or 8!
4521#endif
4522
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004523/* Scans a UTF-8 string and returns the maximum character to be expected
4524 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004525
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004526 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528 */
4529static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004530utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004533 const unsigned char *end = p + string_size;
4534 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004535
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004536 assert(unicode_size != NULL);
4537
4538 /* By having a cascade of independent loops which fallback onto each
4539 other, we minimize the amount of work done in the average loop
4540 iteration, and we also maximize the CPU's ability to predict
4541 branches correctly (because a given condition will have always the
4542 same boolean outcome except perhaps in the last iteration of the
4543 corresponding loop).
4544 In the general case this brings us rather close to decoding
4545 performance pre-PEP 393, despite the two-pass decoding.
4546
4547 Note that the pure ASCII loop is not duplicated once a non-ASCII
4548 character has been encountered. It is actually a pessimization (by
4549 a significant factor) to use this loop on text with many non-ASCII
4550 characters, and it is important to avoid bad performance on valid
4551 utf-8 data (invalid utf-8 being a different can of worms).
4552 */
4553
4554 /* ASCII */
4555 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004556 /* Only check value if it's not a ASCII char... */
4557 if (*p < 0x80) {
4558 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4559 an explanation. */
4560 if (!((size_t) p & LONG_PTR_MASK)) {
4561 /* Help register allocation */
4562 register const unsigned char *_p = p;
4563 while (_p < aligned_end) {
4564 unsigned long value = *(unsigned long *) _p;
4565 if (value & ASCII_CHAR_MASK)
4566 break;
4567 _p += SIZEOF_LONG;
4568 char_count += SIZEOF_LONG;
4569 }
4570 p = _p;
4571 if (p == end)
4572 break;
4573 }
4574 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004575 if (*p < 0x80)
4576 ++char_count;
4577 else
4578 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004580 *unicode_size = char_count;
4581 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004582
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004583_ucs1loop:
4584 for (; p < end; ++p) {
4585 if (*p < 0xc4)
4586 char_count += ((*p & 0xc0) != 0x80);
4587 else
4588 goto _ucs2loop;
4589 }
4590 *unicode_size = char_count;
4591 return 255;
4592
4593_ucs2loop:
4594 for (; p < end; ++p) {
4595 if (*p < 0xf0)
4596 char_count += ((*p & 0xc0) != 0x80);
4597 else
4598 goto _ucs4loop;
4599 }
4600 *unicode_size = char_count;
4601 return 65535;
4602
4603_ucs4loop:
4604 for (; p < end; ++p) {
4605 char_count += ((*p & 0xc0) != 0x80);
4606 }
4607 *unicode_size = char_count;
4608 return 65537;
4609}
4610
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, onError.
   Potential resizing overallocates, so the result needs to shrink at the end.

   Stores `value` at position `index` through unicode_putchar().  `index` is
   evaluated exactly once and copied into a local, so passing an expression
   with a side effect (such as i++) is fine; unicode_putchar() is handed the
   address of that copy, not of the caller's variable.

   If the position lies beyond the current length, the string is first
   resized to pos + pos/8.  A failure of either the resize or the store jumps
   to the enclosing function's `onError` label.

   NOTE(review): the resize is triggered for pos > length only, not for
   pos == length -- presumably a store at exactly `length` is tolerated by
   the helpers; confirm against unicode_putchar()/unicode_resize().
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = index;                                         \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                      \
            unicode_resize(&unicode, pos + pos/8) < 0)                  \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, value) < 0)                 \
            goto onError;                                               \
    } while (0)
4624
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004625static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004626decode_utf8_errors(const char *starts,
4627 Py_ssize_t size,
4628 const char *errors,
4629 Py_ssize_t *consumed,
4630 const char *s,
4631 PyObject *unicode,
4632 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004633{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004635 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 Py_ssize_t startinpos;
4637 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004638 const char *e = starts + size;
4639 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004640 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 PyObject *errorHandler = NULL;
4642 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004643
Antoine Pitrouab868312009-01-10 15:40:25 +00004644 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645
4646 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004647 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
4649 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004650 /* Fast path for runs of ASCII characters. Given that common UTF-8
4651 input will consist of an overwhelming majority of ASCII
4652 characters, we try to optimize for this case by checking
4653 as many characters as a C 'long' can contain.
4654 First, check if we can do an aligned read, as most CPUs have
4655 a penalty for unaligned reads.
4656 */
4657 if (!((size_t) s & LONG_PTR_MASK)) {
4658 /* Help register allocation */
4659 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004660 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004661 while (_s < aligned_end) {
4662 /* Read a whole long at a time (either 4 or 8 bytes),
4663 and do a fast unrolled copy if it only contains ASCII
4664 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004665 unsigned long value = *(unsigned long *) _s;
4666 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004667 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004668 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4669 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4670 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4671 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004672#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004673 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4674 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4675 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4676 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004677#endif
4678 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004679 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004680 }
4681 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004683 if (s == e)
4684 break;
4685 ch = (unsigned char)*s;
4686 }
4687 }
4688
4689 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004690 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 s++;
4692 continue;
4693 }
4694
4695 n = utf8_code_length[ch];
4696
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004697 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 if (consumed)
4699 break;
4700 else {
4701 errmsg = "unexpected end of data";
4702 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004703 endinpos = startinpos+1;
4704 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4705 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 goto utf8Error;
4707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709
4710 switch (n) {
4711
4712 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004713 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 startinpos = s-starts;
4715 endinpos = startinpos+1;
4716 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717
4718 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004719 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 startinpos = s-starts;
4721 endinpos = startinpos+1;
4722 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723
4724 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004725 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004726 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004728 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 goto utf8Error;
4730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004732 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004733 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 break;
4735
4736 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004737 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4738 will result in surrogates in range d800-dfff. Surrogates are
4739 not valid UTF-8 so they are rejected.
4740 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4741 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004742 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004743 (s[2] & 0xc0) != 0x80 ||
4744 ((unsigned char)s[0] == 0xE0 &&
4745 (unsigned char)s[1] < 0xA0) ||
4746 ((unsigned char)s[0] == 0xED &&
4747 (unsigned char)s[1] > 0x9F)) {
4748 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004750 endinpos = startinpos + 1;
4751
4752 /* if s[1] first two bits are 1 and 0, then the invalid
4753 continuation byte is s[2], so increment endinpos by 1,
4754 if not, s[1] is invalid and endinpos doesn't need to
4755 be incremented. */
4756 if ((s[1] & 0xC0) == 0x80)
4757 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 goto utf8Error;
4759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004761 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004762 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004763 break;
4764
4765 case 4:
4766 if ((s[1] & 0xc0) != 0x80 ||
4767 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004768 (s[3] & 0xc0) != 0x80 ||
4769 ((unsigned char)s[0] == 0xF0 &&
4770 (unsigned char)s[1] < 0x90) ||
4771 ((unsigned char)s[0] == 0xF4 &&
4772 (unsigned char)s[1] > 0x8F)) {
4773 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004775 endinpos = startinpos + 1;
4776 if ((s[1] & 0xC0) == 0x80) {
4777 endinpos++;
4778 if ((s[2] & 0xC0) == 0x80)
4779 endinpos++;
4780 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 goto utf8Error;
4782 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004783 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004784 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004785 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004786
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004787 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 }
4790 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004792
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 if (unicode_decode_call_errorhandler(
4795 errors, &errorHandler,
4796 "utf8", errmsg,
4797 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004798 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800 /* Update data because unicode_decode_call_errorhandler might have
4801 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 }
Walter Dörwald69652032004-09-07 20:24:22 +00004804 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807 /* Adjust length and ready string when it contained errors and
4808 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004809 if (unicode_resize(&unicode, i) < 0)
4810 goto onError;
4811 unicode_adjust_maxchar(&unicode);
4812 if (unicode == NULL)
4813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004817 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004818 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 Py_XDECREF(errorHandler);
4822 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004823 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 return NULL;
4825}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004826#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004827
Victor Stinner785938e2011-12-11 20:09:03 +01004828PyObject *
4829PyUnicode_DecodeUTF8Stateful(const char *s,
4830 Py_ssize_t size,
4831 const char *errors,
4832 Py_ssize_t *consumed)
4833{
4834 Py_UCS4 maxchar = 0;
4835 Py_ssize_t unicode_size;
4836 int has_errors = 0;
4837 PyObject *unicode;
4838 int kind;
4839 void *data;
4840 const char *starts = s;
4841 const char *e;
4842 Py_ssize_t i;
4843
4844 if (size == 0) {
4845 if (consumed)
4846 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004847 Py_INCREF(unicode_empty);
4848 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004849 }
4850
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004851 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004852
4853 /* When the string is ASCII only, just use memcpy and return.
4854 unicode_size may be != size if there is an incomplete UTF-8
4855 sequence at the end of the ASCII block. */
4856 if (maxchar < 128 && size == unicode_size) {
4857 if (consumed)
4858 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004859 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004860 }
4861
4862 unicode = PyUnicode_New(unicode_size, maxchar);
4863 if (!unicode)
4864 return NULL;
4865 kind = PyUnicode_KIND(unicode);
4866 data = PyUnicode_DATA(unicode);
4867
4868 /* Unpack UTF-8 encoded data */
4869 i = 0;
4870 e = starts + size;
4871 switch (kind) {
4872 case PyUnicode_1BYTE_KIND:
4873 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4874 break;
4875 case PyUnicode_2BYTE_KIND:
4876 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4877 break;
4878 case PyUnicode_4BYTE_KIND:
4879 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4880 break;
4881 }
4882 if (!has_errors) {
4883 /* Ensure the unicode size calculation was correct */
4884 assert(i == unicode_size);
4885 assert(s == e);
4886 if (consumed)
4887 *consumed = size;
4888 return unicode;
4889 }
4890
4891 /* In case of errors, maxchar and size computation might be incorrect;
4892 code below refits and resizes as necessary. */
4893 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4894}
4895
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004896#ifdef __APPLE__
4897
/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc'ed, L'\0'
   terminated wchar_t buffer.  A byte that cannot be decoded is not an
   error: it is stored as 0xDC00 + byte and decoding resumes at the next
   byte.

   Returns NULL if the buffer size would overflow (after calling
   PyErr_NoMemory()) or if PyMem_Malloc() fails (no exception is set on
   that path in this function). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* one wchar_t per input byte, plus one for the terminator */
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch holds the lead byte until a sequence has been fully validated;
           the surrogateescape label below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* sequence is cut off by the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            /* length 0 or 1 for a byte >= 0x80: escape the byte */
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* escape just the one offending byte and retry from the next one */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}
5001
5002#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005004/* Primary internal function which creates utf8 encoded bytes objects.
5005
5006 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005007 and allocate exactly as much space needed at the end. Else allocate the
5008 maximum possible needed (4 result bytes per Unicode character), and return
5009 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005010*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005011PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005012_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013{
Victor Stinner6099a032011-12-18 14:22:26 +01005014 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005015 void *data;
5016 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005018 if (!PyUnicode_Check(unicode)) {
5019 PyErr_BadArgument();
5020 return NULL;
5021 }
5022
5023 if (PyUnicode_READY(unicode) == -1)
5024 return NULL;
5025
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005026 if (PyUnicode_UTF8(unicode))
5027 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5028 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005029
5030 kind = PyUnicode_KIND(unicode);
5031 data = PyUnicode_DATA(unicode);
5032 size = PyUnicode_GET_LENGTH(unicode);
5033
Benjamin Petersonead6b532011-12-20 17:23:42 -06005034 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005035 default:
5036 assert(0);
5037 case PyUnicode_1BYTE_KIND:
5038 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5039 assert(!PyUnicode_IS_ASCII(unicode));
5040 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5041 case PyUnicode_2BYTE_KIND:
5042 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5043 case PyUnicode_4BYTE_KIND:
5044 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046}
5047
Alexander Belopolsky40018472011-02-26 01:02:56 +00005048PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005049PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5050 Py_ssize_t size,
5051 const char *errors)
5052{
5053 PyObject *v, *unicode;
5054
5055 unicode = PyUnicode_FromUnicode(s, size);
5056 if (unicode == NULL)
5057 return NULL;
5058 v = _PyUnicode_AsUTF8String(unicode, errors);
5059 Py_DECREF(unicode);
5060 return v;
5061}
5062
5063PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005064PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067}
5068
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069/* --- UTF-32 Codec ------------------------------------------------------- */
5070
5071PyObject *
5072PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 Py_ssize_t size,
5074 const char *errors,
5075 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076{
5077 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5078}
5079
5080PyObject *
5081PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 Py_ssize_t size,
5083 const char *errors,
5084 int *byteorder,
5085 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086{
5087 const char *starts = s;
5088 Py_ssize_t startinpos;
5089 Py_ssize_t endinpos;
5090 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005091 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005092 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 int bo = 0; /* assume native ordering by default */
5094 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 /* Offsets from q for retrieving bytes in the right order. */
5096#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5097 int iorder[] = {0, 1, 2, 3};
5098#else
5099 int iorder[] = {3, 2, 1, 0};
5100#endif
5101 PyObject *errorHandler = NULL;
5102 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005103
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 q = (unsigned char *)s;
5105 e = q + size;
5106
5107 if (byteorder)
5108 bo = *byteorder;
5109
5110 /* Check for BOM marks (U+FEFF) in the input and adjust current
5111 byte order setting accordingly. In native mode, the leading BOM
5112 mark is skipped, in all other modes, it is copied to the output
5113 stream as-is (giving a ZWNBSP character). */
5114 if (bo == 0) {
5115 if (size >= 4) {
5116 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 if (bom == 0x0000FEFF) {
5120 q += 4;
5121 bo = -1;
5122 }
5123 else if (bom == 0xFFFE0000) {
5124 q += 4;
5125 bo = 1;
5126 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005127#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 if (bom == 0x0000FEFF) {
5129 q += 4;
5130 bo = 1;
5131 }
5132 else if (bom == 0xFFFE0000) {
5133 q += 4;
5134 bo = -1;
5135 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 }
5139
5140 if (bo == -1) {
5141 /* force LE */
5142 iorder[0] = 0;
5143 iorder[1] = 1;
5144 iorder[2] = 2;
5145 iorder[3] = 3;
5146 }
5147 else if (bo == 1) {
5148 /* force BE */
5149 iorder[0] = 3;
5150 iorder[1] = 2;
5151 iorder[2] = 1;
5152 iorder[3] = 0;
5153 }
5154
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005155 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005156 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005157 if (!unicode)
5158 return NULL;
5159 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005160 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005161 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005162
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 Py_UCS4 ch;
5165 /* remaining bytes at the end? (size should be divisible by 4) */
5166 if (e-q<4) {
5167 if (consumed)
5168 break;
5169 errmsg = "truncated data";
5170 startinpos = ((const char *)q)-starts;
5171 endinpos = ((const char *)e)-starts;
5172 goto utf32Error;
5173 /* The remaining input chars are ignored if the callback
5174 chooses to skip the input */
5175 }
5176 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5177 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005178
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 if (ch >= 0x110000)
5180 {
5181 errmsg = "codepoint not in range(0x110000)";
5182 startinpos = ((const char *)q)-starts;
5183 endinpos = startinpos+4;
5184 goto utf32Error;
5185 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005186 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5187 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 q += 4;
5189 continue;
5190 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 if (unicode_decode_call_errorhandler(
5192 errors, &errorHandler,
5193 "utf32", errmsg,
5194 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005195 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197 }
5198
5199 if (byteorder)
5200 *byteorder = bo;
5201
5202 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005204
5205 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005206 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207 goto onError;
5208
5209 Py_XDECREF(errorHandler);
5210 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005211 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214 Py_DECREF(unicode);
5215 Py_XDECREF(errorHandler);
5216 Py_XDECREF(exc);
5217 return NULL;
5218}
5219
5220PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005221_PyUnicode_EncodeUTF32(PyObject *str,
5222 const char *errors,
5223 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005225 int kind;
5226 void *data;
5227 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005228 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005229 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005230 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005231 /* Offsets from p for storing byte pairs in the right order. */
5232#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5233 int iorder[] = {0, 1, 2, 3};
5234#else
5235 int iorder[] = {3, 2, 1, 0};
5236#endif
5237
Benjamin Peterson29060642009-01-31 22:14:21 +00005238#define STORECHAR(CH) \
5239 do { \
5240 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5241 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5242 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5243 p[iorder[0]] = (CH) & 0xff; \
5244 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 } while(0)
5246
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005247 if (!PyUnicode_Check(str)) {
5248 PyErr_BadArgument();
5249 return NULL;
5250 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005251 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005252 return NULL;
5253 kind = PyUnicode_KIND(str);
5254 data = PyUnicode_DATA(str);
5255 len = PyUnicode_GET_LENGTH(str);
5256
5257 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005258 bytesize = nsize * 4;
5259 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005262 if (v == NULL)
5263 return NULL;
5264
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005265 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005266 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005268 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005269 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005270
5271 if (byteorder == -1) {
5272 /* force LE */
5273 iorder[0] = 0;
5274 iorder[1] = 1;
5275 iorder[2] = 2;
5276 iorder[3] = 3;
5277 }
5278 else if (byteorder == 1) {
5279 /* force BE */
5280 iorder[0] = 3;
5281 iorder[1] = 2;
5282 iorder[2] = 1;
5283 iorder[3] = 0;
5284 }
5285
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005286 for (i = 0; i < len; i++)
5287 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005288
5289 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005290 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005291#undef STORECHAR
5292}
5293
Alexander Belopolsky40018472011-02-26 01:02:56 +00005294PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005295PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5296 Py_ssize_t size,
5297 const char *errors,
5298 int byteorder)
5299{
5300 PyObject *result;
5301 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5302 if (tmp == NULL)
5303 return NULL;
5304 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5305 Py_DECREF(tmp);
5306 return result;
5307}
5308
5309PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005310PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005311{
Victor Stinnerb960b342011-11-20 19:12:52 +01005312 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313}
5314
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315/* --- UTF-16 Codec ------------------------------------------------------- */
5316
Tim Peters772747b2001-08-09 22:21:55 +00005317PyObject *
5318PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 Py_ssize_t size,
5320 const char *errors,
5321 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322{
Walter Dörwald69652032004-09-07 20:24:22 +00005323 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5324}
5325
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.

   Each mask has one bit set per 16-bit unit of the 'long':
   FAST_CHAR_MASK selects bit 15 of every unit and is used when the input
   is in native byte ordering; SWAPPED_FAST_CHAR_MASK selects bit 7 of
   every unit (the same bit once the two bytes of the unit are exchanged)
   and is used when the input is in byteswapped ordering.  A zero result
   of `block & mask` therefore means every unit in the block is < 0x8000.

   Only 4-byte and 8-byte 'long' are supported; anything else is a
   compile-time error.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
5342
Walter Dörwald69652032004-09-07 20:24:22 +00005343PyObject *
5344PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 Py_ssize_t size,
5346 const char *errors,
5347 int *byteorder,
5348 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005349{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005351 Py_ssize_t startinpos;
5352 Py_ssize_t endinpos;
5353 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005354 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005355 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005356 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005357 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005358 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005359 /* Offsets from q for retrieving byte pairs in the right order. */
5360#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5361 int ihi = 1, ilo = 0;
5362#else
5363 int ihi = 0, ilo = 1;
5364#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 PyObject *errorHandler = NULL;
5366 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367
5368 /* Note: size will always be longer than the resulting Unicode
5369 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005370 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 if (!unicode)
5372 return NULL;
5373 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005374 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005375 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
Tim Peters772747b2001-08-09 22:21:55 +00005377 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005378 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379
5380 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005381 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005383 /* Check for BOM marks (U+FEFF) in the input and adjust current
5384 byte order setting accordingly. In native mode, the leading BOM
5385 mark is skipped, in all other modes, it is copied to the output
5386 stream as-is (giving a ZWNBSP character). */
5387 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005388 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005389 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005390#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 if (bom == 0xFEFF) {
5392 q += 2;
5393 bo = -1;
5394 }
5395 else if (bom == 0xFFFE) {
5396 q += 2;
5397 bo = 1;
5398 }
Tim Petersced69f82003-09-16 20:30:58 +00005399#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 if (bom == 0xFEFF) {
5401 q += 2;
5402 bo = 1;
5403 }
5404 else if (bom == 0xFFFE) {
5405 q += 2;
5406 bo = -1;
5407 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005408#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
Tim Peters772747b2001-08-09 22:21:55 +00005412 if (bo == -1) {
5413 /* force LE */
5414 ihi = 1;
5415 ilo = 0;
5416 }
5417 else if (bo == 1) {
5418 /* force BE */
5419 ihi = 0;
5420 ilo = 1;
5421 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005422#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5423 native_ordering = ilo < ihi;
5424#else
5425 native_ordering = ilo > ihi;
5426#endif
Tim Peters772747b2001-08-09 22:21:55 +00005427
Antoine Pitrouab868312009-01-10 15:40:25 +00005428 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005429 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005430 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005431 /* First check for possible aligned read of a C 'long'. Unaligned
5432 reads are more expensive, better to defer to another iteration. */
5433 if (!((size_t) q & LONG_PTR_MASK)) {
5434 /* Fast path for runs of non-surrogate chars. */
5435 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005436 int kind = PyUnicode_KIND(unicode);
5437 void *data = PyUnicode_DATA(unicode);
5438 while (_q < aligned_end) {
5439 unsigned long block = * (unsigned long *) _q;
5440 unsigned short *pblock = (unsigned short*)&block;
5441 Py_UCS4 maxch;
5442 if (native_ordering) {
5443 /* Can use buffer directly */
5444 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005445 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005446 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005447 else {
5448 /* Need to byte-swap */
5449 unsigned char *_p = (unsigned char*)pblock;
5450 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005451 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005452 _p[0] = _q[1];
5453 _p[1] = _q[0];
5454 _p[2] = _q[3];
5455 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005456#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005457 _p[4] = _q[5];
5458 _p[5] = _q[4];
5459 _p[6] = _q[7];
5460 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005461#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005462 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005463 maxch = Py_MAX(pblock[0], pblock[1]);
5464#if SIZEOF_LONG == 8
5465 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5466#endif
5467 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5468 if (unicode_widen(&unicode, maxch) < 0)
5469 goto onError;
5470 kind = PyUnicode_KIND(unicode);
5471 data = PyUnicode_DATA(unicode);
5472 }
5473 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5474 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5475#if SIZEOF_LONG == 8
5476 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5477 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5478#endif
5479 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005480 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005481 q = _q;
5482 if (q >= e)
5483 break;
5484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486
Benjamin Peterson14339b62009-01-31 16:36:08 +00005487 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005488
Victor Stinner551ac952011-11-29 22:58:13 +01005489 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005490 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5491 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 continue;
5493 }
5494
5495 /* UTF-16 code pair: */
5496 if (q > e) {
5497 errmsg = "unexpected end of data";
5498 startinpos = (((const char *)q) - 2) - starts;
5499 endinpos = ((const char *)e) + 1 - starts;
5500 goto utf16Error;
5501 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005502 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5503 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005505 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005506 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005507 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005508 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 continue;
5510 }
5511 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005512 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 startinpos = (((const char *)q)-4)-starts;
5514 endinpos = startinpos+2;
5515 goto utf16Error;
5516 }
5517
Benjamin Peterson14339b62009-01-31 16:36:08 +00005518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 errmsg = "illegal encoding";
5520 startinpos = (((const char *)q)-2)-starts;
5521 endinpos = startinpos+2;
5522 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005523
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005526 errors,
5527 &errorHandler,
5528 "utf16", errmsg,
5529 &starts,
5530 (const char **)&e,
5531 &startinpos,
5532 &endinpos,
5533 &exc,
5534 (const char **)&q,
5535 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005536 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005539 /* remaining byte at the end? (size should be even) */
5540 if (e == q) {
5541 if (!consumed) {
5542 errmsg = "truncated data";
5543 startinpos = ((const char *)q) - starts;
5544 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005545 if (unicode_decode_call_errorhandler(
5546 errors,
5547 &errorHandler,
5548 "utf16", errmsg,
5549 &starts,
5550 (const char **)&e,
5551 &startinpos,
5552 &endinpos,
5553 &exc,
5554 (const char **)&q,
5555 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005557 goto onError;
5558 /* The remaining input chars are ignored if the callback
5559 chooses to skip the input */
5560 }
5561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
5563 if (byteorder)
5564 *byteorder = bo;
5565
Walter Dörwald69652032004-09-07 20:24:22 +00005566 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005568
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005570 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 goto onError;
5572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 Py_XDECREF(errorHandler);
5574 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005575 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579 Py_XDECREF(errorHandler);
5580 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 return NULL;
5582}
5583
Antoine Pitrouab868312009-01-10 15:40:25 +00005584#undef FAST_CHAR_MASK
5585#undef SWAPPED_FAST_CHAR_MASK
5586
Tim Peters772747b2001-08-09 22:21:55 +00005587PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005588_PyUnicode_EncodeUTF16(PyObject *str,
5589 const char *errors,
5590 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005592 int kind;
5593 void *data;
5594 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005595 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005596 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005597 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005598 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005599 /* Offsets from p for storing byte pairs in the right order. */
5600#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5601 int ihi = 1, ilo = 0;
5602#else
5603 int ihi = 0, ilo = 1;
5604#endif
5605
Benjamin Peterson29060642009-01-31 22:14:21 +00005606#define STORECHAR(CH) \
5607 do { \
5608 p[ihi] = ((CH) >> 8) & 0xff; \
5609 p[ilo] = (CH) & 0xff; \
5610 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005611 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005613 if (!PyUnicode_Check(str)) {
5614 PyErr_BadArgument();
5615 return NULL;
5616 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005617 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005618 return NULL;
5619 kind = PyUnicode_KIND(str);
5620 data = PyUnicode_DATA(str);
5621 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005622
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005623 pairs = 0;
5624 if (kind == PyUnicode_4BYTE_KIND)
5625 for (i = 0; i < len; i++)
5626 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5627 pairs++;
5628 /* 2 * (len + pairs + (byteorder == 0)) */
5629 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005631 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005632 bytesize = nsize * 2;
5633 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005635 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 if (v == NULL)
5637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005639 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005642 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005643 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005644
5645 if (byteorder == -1) {
5646 /* force LE */
5647 ihi = 1;
5648 ilo = 0;
5649 }
5650 else if (byteorder == 1) {
5651 /* force BE */
5652 ihi = 0;
5653 ilo = 1;
5654 }
5655
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005656 for (i = 0; i < len; i++) {
5657 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5658 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005660 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5661 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 }
Tim Peters772747b2001-08-09 22:21:55 +00005663 STORECHAR(ch);
5664 if (ch2)
5665 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005666 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005667
5668 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005669 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005670#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671}
5672
Alexander Belopolsky40018472011-02-26 01:02:56 +00005673PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005674PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5675 Py_ssize_t size,
5676 const char *errors,
5677 int byteorder)
5678{
5679 PyObject *result;
5680 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5681 if (tmp == NULL)
5682 return NULL;
5683 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5684 Py_DECREF(tmp);
5685 return result;
5686}
5687
5688PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005689PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005691 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692}
5693
5694/* --- Unicode Escape Codec ----------------------------------------------- */
5695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5697 if all the escapes in the string make it still a valid ASCII string.
5698 Returns -1 if any escapes were found which cause the string to
5699 pop out of ASCII range. Otherwise returns the length of the
5700 required buffer to hold the string.
5701 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005702static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5704{
5705 const unsigned char *p = (const unsigned char *)s;
5706 const unsigned char *end = p + size;
5707 Py_ssize_t length = 0;
5708
5709 if (size < 0)
5710 return -1;
5711
5712 for (; p < end; ++p) {
5713 if (*p > 127) {
5714 /* Non-ASCII */
5715 return -1;
5716 }
5717 else if (*p != '\\') {
5718 /* Normal character */
5719 ++length;
5720 }
5721 else {
5722 /* Backslash-escape, check next char */
5723 ++p;
5724 /* Escape sequence reaches till end of string or
5725 non-ASCII follow-up. */
5726 if (p >= end || *p > 127)
5727 return -1;
5728 switch (*p) {
5729 case '\n':
5730 /* backslash + \n result in zero characters */
5731 break;
5732 case '\\': case '\'': case '\"':
5733 case 'b': case 'f': case 't':
5734 case 'n': case 'r': case 'v': case 'a':
5735 ++length;
5736 break;
5737 case '0': case '1': case '2': case '3':
5738 case '4': case '5': case '6': case '7':
5739 case 'x': case 'u': case 'U': case 'N':
5740 /* these do not guarantee ASCII characters */
5741 return -1;
5742 default:
5743 /* count the backslash + the other character */
5744 length += 2;
5745 }
5746 }
5747 }
5748 return length;
5749}
5750
Fredrik Lundh06d12682001-01-24 07:59:11 +00005751static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005752
Alexander Belopolsky40018472011-02-26 01:02:56 +00005753PyObject *
5754PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005755 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 Py_ssize_t startinpos;
5760 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005761 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005762 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 char* message;
5765 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 PyObject *errorHandler = NULL;
5767 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005769 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005770
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772
5773 /* After length_of_escaped_ascii_string() there are two alternatives,
5774 either the string is pure ASCII with named escapes like \n, etc.
5775 and we determined it's exact size (common case)
5776 or it contains \x, \u, ... escape sequences. then we create a
5777 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005778 if (len >= 0) {
5779 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 if (!v)
5781 goto onError;
5782 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 }
5784 else {
5785 /* Escaped strings will always be longer than the resulting
5786 Unicode string, so we start with size here and then reduce the
5787 length after conversion to the true value.
5788 (but if the error callback returns a long replacement string
5789 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005790 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 if (!v)
5792 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005793 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 }
5795
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005797 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005798 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005800
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 while (s < end) {
5802 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005803 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005806 /* The only case in which i == ascii_length is a backslash
5807 followed by a newline. */
5808 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005809
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 /* Non-escape characters are interpreted as Unicode ordinals */
5811 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005812 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 continue;
5815 }
5816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 /* \ - Escapes */
5819 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005820 c = *s++;
5821 if (s > end)
5822 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005823
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005824 /* The only case in which i == ascii_length is a backslash
5825 followed by a newline. */
5826 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005827
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005828 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831#define WRITECHAR(ch) \
5832 do { \
5833 if (unicode_putchar(&v, &i, ch) < 0) \
5834 goto onError; \
5835 }while(0)
5836
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005838 case '\\': WRITECHAR('\\'); break;
5839 case '\'': WRITECHAR('\''); break;
5840 case '\"': WRITECHAR('\"'); break;
5841 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005843 case 'f': WRITECHAR('\014'); break;
5844 case 't': WRITECHAR('\t'); break;
5845 case 'n': WRITECHAR('\n'); break;
5846 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005847 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005848 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005849 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005850 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 case '0': case '1': case '2': case '3':
5854 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005855 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005856 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005857 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005858 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005859 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005861 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 break;
5863
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 /* hex escapes */
5865 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005867 digits = 2;
5868 message = "truncated \\xXX escape";
5869 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005873 digits = 4;
5874 message = "truncated \\uXXXX escape";
5875 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005878 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005879 digits = 8;
5880 message = "truncated \\UXXXXXXXX escape";
5881 hexescape:
5882 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (s+digits>end) {
5884 endinpos = size;
5885 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 errors, &errorHandler,
5887 "unicodeescape", "end of string in escape sequence",
5888 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 goto onError;
5891 goto nextByte;
5892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893 for (j = 0; j < digits; ++j) {
5894 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005895 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 errors, &errorHandler,
5899 "unicodeescape", message,
5900 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005902 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005903 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005905 }
5906 chr = (chr<<4) & ~0xF;
5907 if (c >= '0' && c <= '9')
5908 chr += c - '0';
5909 else if (c >= 'a' && c <= 'f')
5910 chr += 10 + c - 'a';
5911 else
5912 chr += 10 + c - 'A';
5913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005914 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005915 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 /* _decoding_error will have already written into the
5917 target buffer. */
5918 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005919 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005920 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005921 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005922 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005923 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 errors, &errorHandler,
5927 "unicodeescape", "illegal Unicode character",
5928 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005929 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005930 goto onError;
5931 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005932 break;
5933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005935 case 'N':
5936 message = "malformed \\N character escape";
5937 if (ucnhash_CAPI == NULL) {
5938 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005939 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5940 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005941 if (ucnhash_CAPI == NULL)
5942 goto ucnhashError;
5943 }
5944 if (*s == '{') {
5945 const char *start = s+1;
5946 /* look for the closing brace */
5947 while (*s != '}' && s < end)
5948 s++;
5949 if (s > start && s < end && *s == '}') {
5950 /* found a name. look it up in the unicode database */
5951 message = "unknown Unicode character name";
5952 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005954 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005955 goto store;
5956 }
5957 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 errors, &errorHandler,
5961 "unicodeescape", message,
5962 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005963 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005964 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005965 break;
5966
5967 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005968 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 message = "\\ at end of string";
5970 s--;
5971 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 errors, &errorHandler,
5974 "unicodeescape", message,
5975 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005976 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005977 goto onError;
5978 }
5979 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005980 WRITECHAR('\\');
5981 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005988#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989
Victor Stinner16e6a802011-12-12 13:24:15 +01005990 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005991 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005992 Py_XDECREF(errorHandler);
5993 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005994 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005997 PyErr_SetString(
5998 PyExc_UnicodeError,
5999 "\\N escapes not supported (can't load unicodedata module)"
6000 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006001 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 Py_XDECREF(errorHandler);
6003 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006004 return NULL;
6005
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 Py_XDECREF(errorHandler);
6009 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 return NULL;
6011}
6012
/* Return a Unicode-Escape encoded bytes object for the given Unicode
   object.  The result is not enclosed in quotes. */
6019
Alexander Belopolsky40018472011-02-26 01:02:56 +00006020PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006021PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 int kind;
6027 void *data;
6028 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Thomas Wouters89f507f2006-12-13 04:49:30 +00006030 /* Initial allocation is based on the longest-possible unichr
6031 escape.
6032
6033 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6034 unichr, so in this case it's the longest unichr escape. In
6035 narrow (UTF-16) builds this is five chars per source unichr
6036 since there are two unichrs in the surrogate pair, so in narrow
6037 (UTF-16) builds it's not the longest unichr escape.
6038
6039 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6040 so in the narrow (UTF-16) build case it's the longest unichr
6041 escape.
6042 */
6043
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 if (!PyUnicode_Check(unicode)) {
6045 PyErr_BadArgument();
6046 return NULL;
6047 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006048 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006049 return NULL;
6050 len = PyUnicode_GET_LENGTH(unicode);
6051 kind = PyUnicode_KIND(unicode);
6052 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006053 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6055 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6056 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6057 }
6058
6059 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006060 return PyBytes_FromStringAndSize(NULL, 0);
6061
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006062 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006064
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006065 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 if (repr == NULL)
6070 return NULL;
6071
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006072 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006074 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006075 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006076
Walter Dörwald79e913e2007-05-12 11:08:06 +00006077 /* Escape backslashes */
6078 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 *p++ = '\\';
6080 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006081 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006082 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006083
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006084 /* Map 21-bit characters to '\U00xxxxxx' */
6085 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006086 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006087 *p++ = '\\';
6088 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006089 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6090 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6091 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6092 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6093 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6094 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6095 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6096 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006098 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006099
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006101 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 *p++ = '\\';
6103 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006104 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6105 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6106 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6107 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006109
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006110 /* Map special whitespace to '\t', \n', '\r' */
6111 else if (ch == '\t') {
6112 *p++ = '\\';
6113 *p++ = 't';
6114 }
6115 else if (ch == '\n') {
6116 *p++ = '\\';
6117 *p++ = 'n';
6118 }
6119 else if (ch == '\r') {
6120 *p++ = '\\';
6121 *p++ = 'r';
6122 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006123
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006124 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006125 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006127 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006128 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6129 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006130 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006131
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 /* Copy everything else as-is */
6133 else
6134 *p++ = (char) ch;
6135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006137 assert(p - PyBytes_AS_STRING(repr) > 0);
6138 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6139 return NULL;
6140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141}
6142
Alexander Belopolsky40018472011-02-26 01:02:56 +00006143PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6145 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 PyObject *result;
6148 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6149 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 result = PyUnicode_AsUnicodeEscapeString(tmp);
6152 Py_DECREF(tmp);
6153 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154}
6155
6156/* --- Raw Unicode Escape Codec ------------------------------------------- */
6157
Alexander Belopolsky40018472011-02-26 01:02:56 +00006158PyObject *
6159PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006160 Py_ssize_t size,
6161 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006164 Py_ssize_t startinpos;
6165 Py_ssize_t endinpos;
6166 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006167 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 const char *end;
6169 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 PyObject *errorHandler = NULL;
6171 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006172
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 /* Escaped strings will always be longer than the resulting
6174 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006175 length after conversion to the true value. (But decoding error
6176 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006177 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006181 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006182 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 end = s + size;
6184 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 unsigned char c;
6186 Py_UCS4 x;
6187 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006188 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 /* Non-escape characters are interpreted as Unicode ordinals */
6191 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006192 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6193 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 startinpos = s-starts;
6197
6198 /* \u-escapes are only interpreted iff the number of leading
6199 backslashes if odd */
6200 bs = s;
6201 for (;s < end;) {
6202 if (*s != '\\')
6203 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006204 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6205 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 }
6207 if (((s - bs) & 1) == 0 ||
6208 s >= end ||
6209 (*s != 'u' && *s != 'U')) {
6210 continue;
6211 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006212 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 count = *s=='u' ? 4 : 8;
6214 s++;
6215
6216 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 for (x = 0, i = 0; i < count; ++i, ++s) {
6218 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006219 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 endinpos = s-starts;
6221 if (unicode_decode_call_errorhandler(
6222 errors, &errorHandler,
6223 "rawunicodeescape", "truncated \\uXXXX",
6224 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006225 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 goto onError;
6227 goto nextByte;
6228 }
6229 x = (x<<4) & ~0xF;
6230 if (c >= '0' && c <= '9')
6231 x += c - '0';
6232 else if (c >= 'a' && c <= 'f')
6233 x += 10 + c - 'a';
6234 else
6235 x += 10 + c - 'A';
6236 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006237 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006238 if (unicode_putchar(&v, &outpos, x) < 0)
6239 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006240 } else {
6241 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006242 if (unicode_decode_call_errorhandler(
6243 errors, &errorHandler,
6244 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006246 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 nextByte:
6250 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006252 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 Py_XDECREF(errorHandler);
6255 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006256 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006257
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 Py_XDECREF(errorHandler);
6261 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 return NULL;
6263}
6264
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006267PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006269 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 char *p;
6271 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006272 Py_ssize_t expandsize, pos;
6273 int kind;
6274 void *data;
6275 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 if (!PyUnicode_Check(unicode)) {
6278 PyErr_BadArgument();
6279 return NULL;
6280 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006281 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006282 return NULL;
6283 kind = PyUnicode_KIND(unicode);
6284 data = PyUnicode_DATA(unicode);
6285 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006286 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6287 bytes, and 1 byte characters 4. */
6288 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006289
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006290 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006292
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006293 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 if (repr == NULL)
6295 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006296 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006297 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006299 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006300 for (pos = 0; pos < len; pos++) {
6301 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 /* Map 32-bit characters to '\Uxxxxxxxx' */
6303 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006304 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006305 *p++ = '\\';
6306 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006307 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6308 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6309 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6310 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6311 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6312 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6313 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6314 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006315 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006317 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 *p++ = '\\';
6319 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006320 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6321 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6322 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6323 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 /* Copy everything else as-is */
6326 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 *p++ = (char) ch;
6328 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006329
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 assert(p > q);
6331 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006332 return NULL;
6333 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334}
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006337PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6338 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340 PyObject *result;
6341 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6342 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006343 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006344 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6345 Py_DECREF(tmp);
6346 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347}
6348
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006349/* --- Unicode Internal Codec ------------------------------------------- */
6350
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351PyObject *
6352_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006353 Py_ssize_t size,
6354 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006355{
6356 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006357 Py_ssize_t startinpos;
6358 Py_ssize_t endinpos;
6359 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006360 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006361 const char *end;
6362 const char *reason;
6363 PyObject *errorHandler = NULL;
6364 PyObject *exc = NULL;
6365
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006366 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006367 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006368 1))
6369 return NULL;
6370
Thomas Wouters89f507f2006-12-13 04:49:30 +00006371 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006372 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006373 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006375 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006376 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006377 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006378 end = s + size;
6379
6380 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006381 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006382 Py_UCS4 ch;
6383 /* We copy the raw representation one byte at a time because the
6384 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006385 ((char *) &uch)[0] = s[0];
6386 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006387#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006388 ((char *) &uch)[2] = s[2];
6389 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006390#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006391 ch = uch;
6392
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006393 /* We have to sanity check the raw data, otherwise doom looms for
6394 some malformed UCS-4 data. */
6395 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006396#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006397 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006398#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399 end-s < Py_UNICODE_SIZE
6400 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006402 startinpos = s - starts;
6403 if (end-s < Py_UNICODE_SIZE) {
6404 endinpos = end-starts;
6405 reason = "truncated input";
6406 }
6407 else {
6408 endinpos = s - starts + Py_UNICODE_SIZE;
6409 reason = "illegal code point (> 0x10FFFF)";
6410 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006411 if (unicode_decode_call_errorhandler(
6412 errors, &errorHandler,
6413 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006414 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006415 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006416 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006417 continue;
6418 }
6419
6420 s += Py_UNICODE_SIZE;
6421#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006422 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006423 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006424 Py_UNICODE uch2;
6425 ((char *) &uch2)[0] = s[0];
6426 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006427 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006428 {
Victor Stinner551ac952011-11-29 22:58:13 +01006429 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006430 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006431 }
6432 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006433#endif
6434
6435 if (unicode_putchar(&v, &outpos, ch) < 0)
6436 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006437 }
6438
Victor Stinner16e6a802011-12-12 13:24:15 +01006439 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006440 goto onError;
6441 Py_XDECREF(errorHandler);
6442 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006443 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006446 Py_XDECREF(v);
6447 Py_XDECREF(errorHandler);
6448 Py_XDECREF(exc);
6449 return NULL;
6450}
6451
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452/* --- Latin-1 Codec ------------------------------------------------------ */
6453
Alexander Belopolsky40018472011-02-26 01:02:56 +00006454PyObject *
6455PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006456 Py_ssize_t size,
6457 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006460 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461}
6462
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006464static void
6465make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006466 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006467 PyObject *unicode,
6468 Py_ssize_t startpos, Py_ssize_t endpos,
6469 const char *reason)
6470{
6471 if (*exceptionObject == NULL) {
6472 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006474 encoding, unicode, startpos, endpos, reason);
6475 }
6476 else {
6477 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6478 goto onError;
6479 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6480 goto onError;
6481 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6482 goto onError;
6483 return;
6484 onError:
6485 Py_DECREF(*exceptionObject);
6486 *exceptionObject = NULL;
6487 }
6488}
6489
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491static void
6492raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006493 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006494 PyObject *unicode,
6495 Py_ssize_t startpos, Py_ssize_t endpos,
6496 const char *reason)
6497{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006498 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006499 encoding, unicode, startpos, endpos, reason);
6500 if (*exceptionObject != NULL)
6501 PyCodec_StrictErrors(*exceptionObject);
6502}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006503
6504/* error handling callback helper:
6505 build arguments, call the callback and check the arguments,
6506 put the result into newpos and return the replacement string, which
6507 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006508static PyObject *
6509unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006510 PyObject **errorHandler,
6511 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006513 Py_ssize_t startpos, Py_ssize_t endpos,
6514 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006516 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006517 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 PyObject *restuple;
6519 PyObject *resunicode;
6520
6521 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006525 }
6526
Benjamin Petersonbac79492012-01-14 13:34:47 -05006527 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 return NULL;
6529 len = PyUnicode_GET_LENGTH(unicode);
6530
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006531 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006533 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535
6536 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006538 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006541 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 Py_DECREF(restuple);
6543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006545 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 &resunicode, newpos)) {
6547 Py_DECREF(restuple);
6548 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006549 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006550 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6551 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6552 Py_DECREF(restuple);
6553 return NULL;
6554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 *newpos = len + *newpos;
6557 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6559 Py_DECREF(restuple);
6560 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 Py_INCREF(resunicode);
6563 Py_DECREF(restuple);
6564 return resunicode;
6565}
6566
Alexander Belopolsky40018472011-02-26 01:02:56 +00006567static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006568unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006569 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006570 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006571{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 /* input state */
6573 Py_ssize_t pos=0, size;
6574 int kind;
6575 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 /* output object */
6577 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578 /* pointer into the output */
6579 char *str;
6580 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006582 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6583 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584 PyObject *errorHandler = NULL;
6585 PyObject *exc = NULL;
6586 /* the following variable is used for caching string comparisons
6587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6588 int known_errorHandler = -1;
6589
Benjamin Petersonbac79492012-01-14 13:34:47 -05006590 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 return NULL;
6592 size = PyUnicode_GET_LENGTH(unicode);
6593 kind = PyUnicode_KIND(unicode);
6594 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 /* allocate enough for a simple encoding without
6596 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006597 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006598 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006601 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006602 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 ressize = size;
6604
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 while (pos < size) {
6606 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 /* can we encode this? */
6609 if (c<limit) {
6610 /* no overflow check, because we know that the space is enough */
6611 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006613 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 Py_ssize_t requiredsize;
6616 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 Py_ssize_t collstart = pos;
6620 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 ++collend;
6624 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6625 if (known_errorHandler==-1) {
6626 if ((errors==NULL) || (!strcmp(errors, "strict")))
6627 known_errorHandler = 1;
6628 else if (!strcmp(errors, "replace"))
6629 known_errorHandler = 2;
6630 else if (!strcmp(errors, "ignore"))
6631 known_errorHandler = 3;
6632 else if (!strcmp(errors, "xmlcharrefreplace"))
6633 known_errorHandler = 4;
6634 else
6635 known_errorHandler = 0;
6636 }
6637 switch (known_errorHandler) {
6638 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006639 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 goto onError;
6641 case 2: /* replace */
6642 while (collstart++<collend)
6643 *str++ = '?'; /* fall through */
6644 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 break;
6647 case 4: /* xmlcharrefreplace */
6648 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 /* determine replacement size */
6650 for (i = collstart, repsize = 0; i < collend; ++i) {
6651 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6652 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006654 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006664 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006665 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006667 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 if (requiredsize > ressize) {
6671 if (requiredsize<2*ressize)
6672 requiredsize = 2*ressize;
6673 if (_PyBytes_Resize(&res, requiredsize))
6674 goto onError;
6675 str = PyBytes_AS_STRING(res) + respos;
6676 ressize = requiredsize;
6677 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 /* generate replacement */
6679 for (i = collstart; i < collend; ++i) {
6680 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 break;
6684 default:
6685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686 encoding, reason, unicode, &exc,
6687 collstart, collend, &newpos);
6688 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006689 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006691 if (PyBytes_Check(repunicode)) {
6692 /* Directly copy bytes result to output. */
6693 repsize = PyBytes_Size(repunicode);
6694 if (repsize > 1) {
6695 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006696 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006697 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6698 Py_DECREF(repunicode);
6699 goto onError;
6700 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006701 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006702 ressize += repsize-1;
6703 }
6704 memcpy(str, PyBytes_AsString(repunicode), repsize);
6705 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006707 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006708 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006709 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 /* need more space? (at least enough for what we
6711 have+the replacement+the rest of the string, so
6712 we won't have to check space for encodable characters) */
6713 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714 repsize = PyUnicode_GET_LENGTH(repunicode);
6715 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 if (requiredsize > ressize) {
6717 if (requiredsize<2*ressize)
6718 requiredsize = 2*ressize;
6719 if (_PyBytes_Resize(&res, requiredsize)) {
6720 Py_DECREF(repunicode);
6721 goto onError;
6722 }
6723 str = PyBytes_AS_STRING(res) + respos;
6724 ressize = requiredsize;
6725 }
6726 /* check if there is anything unencodable in the replacement
6727 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 for (i = 0; repsize-->0; ++i, ++str) {
6729 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006731 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 Py_DECREF(repunicode);
6734 goto onError;
6735 }
6736 *str = (char)c;
6737 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006739 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741 }
6742 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006743 /* Resize if we allocated to much */
6744 size = str - PyBytes_AS_STRING(res);
6745 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006746 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006747 if (_PyBytes_Resize(&res, size) < 0)
6748 goto onError;
6749 }
6750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751 Py_XDECREF(errorHandler);
6752 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006753 return res;
6754
6755 onError:
6756 Py_XDECREF(res);
6757 Py_XDECREF(errorHandler);
6758 Py_XDECREF(exc);
6759 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006760}
6761
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006762/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763PyObject *
6764PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006765 Py_ssize_t size,
6766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 PyObject *result;
6769 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6770 if (unicode == NULL)
6771 return NULL;
6772 result = unicode_encode_ucs1(unicode, errors, 256);
6773 Py_DECREF(unicode);
6774 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Alexander Belopolsky40018472011-02-26 01:02:56 +00006777PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779{
6780 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 PyErr_BadArgument();
6782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006784 if (PyUnicode_READY(unicode) == -1)
6785 return NULL;
6786 /* Fast path: if it is a one-byte string, construct
6787 bytes object directly. */
6788 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6789 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6790 PyUnicode_GET_LENGTH(unicode));
6791 /* Non-Latin-1 characters present. Defer to above function to
6792 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006794}
6795
6796PyObject*
6797PyUnicode_AsLatin1String(PyObject *unicode)
6798{
6799 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800}
6801
6802/* --- 7-bit ASCII Codec -------------------------------------------------- */
6803
Alexander Belopolsky40018472011-02-26 01:02:56 +00006804PyObject *
6805PyUnicode_DecodeASCII(const char *s,
6806 Py_ssize_t size,
6807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006810 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006811 int kind;
6812 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006813 Py_ssize_t startinpos;
6814 Py_ssize_t endinpos;
6815 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006817 int has_error;
6818 const unsigned char *p = (const unsigned char *)s;
6819 const unsigned char *end = p + size;
6820 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006821 PyObject *errorHandler = NULL;
6822 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006823
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006824 if (size == 0) {
6825 Py_INCREF(unicode_empty);
6826 return unicode_empty;
6827 }
6828
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006830 if (size == 1 && (unsigned char)s[0] < 128)
6831 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006832
Victor Stinner702c7342011-10-05 13:50:52 +02006833 has_error = 0;
6834 while (p < end && !has_error) {
6835 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6836 an explanation. */
6837 if (!((size_t) p & LONG_PTR_MASK)) {
6838 /* Help register allocation */
6839 register const unsigned char *_p = p;
6840 while (_p < aligned_end) {
6841 unsigned long value = *(unsigned long *) _p;
6842 if (value & ASCII_CHAR_MASK) {
6843 has_error = 1;
6844 break;
6845 }
6846 _p += SIZEOF_LONG;
6847 }
6848 if (_p == end)
6849 break;
6850 if (has_error)
6851 break;
6852 p = _p;
6853 }
6854 if (*p & 0x80) {
6855 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006856 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006857 }
6858 else {
6859 ++p;
6860 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006861 }
Victor Stinner702c7342011-10-05 13:50:52 +02006862 if (!has_error)
6863 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006864
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006865 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006869 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006870 kind = PyUnicode_KIND(v);
6871 data = PyUnicode_DATA(v);
6872 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873 e = s + size;
6874 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 register unsigned char c = (unsigned char)*s;
6876 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006877 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 ++s;
6879 }
6880 else {
6881 startinpos = s-starts;
6882 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 if (unicode_decode_call_errorhandler(
6884 errors, &errorHandler,
6885 "ascii", "ordinal not in range(128)",
6886 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006887 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006889 kind = PyUnicode_KIND(v);
6890 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006893 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895 Py_XDECREF(errorHandler);
6896 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006897 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006898 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006899
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 Py_XDECREF(errorHandler);
6903 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 return NULL;
6905}
6906
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006907/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006908PyObject *
6909PyUnicode_EncodeASCII(const Py_UNICODE *p,
6910 Py_ssize_t size,
6911 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 PyObject *result;
6914 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6915 if (unicode == NULL)
6916 return NULL;
6917 result = unicode_encode_ucs1(unicode, errors, 128);
6918 Py_DECREF(unicode);
6919 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006923_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924{
6925 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 PyErr_BadArgument();
6927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006929 if (PyUnicode_READY(unicode) == -1)
6930 return NULL;
6931 /* Fast path: if it is an ASCII-only string, construct bytes object
6932 directly. Else defer to above function to raise the exception. */
6933 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6934 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6935 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006936 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006937}
6938
6939PyObject *
6940PyUnicode_AsASCIIString(PyObject *unicode)
6941{
6942 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943}
6944
Victor Stinner99b95382011-07-04 14:23:54 +02006945#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006947/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006948
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006949#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950#define NEED_RETRY
6951#endif
6952
Victor Stinner3a50e702011-10-18 21:21:00 +02006953#ifndef WC_ERR_INVALID_CHARS
6954# define WC_ERR_INVALID_CHARS 0x0080
6955#endif
6956
6957static char*
6958code_page_name(UINT code_page, PyObject **obj)
6959{
6960 *obj = NULL;
6961 if (code_page == CP_ACP)
6962 return "mbcs";
6963 if (code_page == CP_UTF7)
6964 return "CP_UTF7";
6965 if (code_page == CP_UTF8)
6966 return "CP_UTF8";
6967
6968 *obj = PyBytes_FromFormat("cp%u", code_page);
6969 if (*obj == NULL)
6970 return NULL;
6971 return PyBytes_AS_STRING(*obj);
6972}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973
Alexander Belopolsky40018472011-02-26 01:02:56 +00006974static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006975is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976{
6977 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006978 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979
Victor Stinner3a50e702011-10-18 21:21:00 +02006980 if (!IsDBCSLeadByteEx(code_page, *curr))
6981 return 0;
6982
6983 prev = CharPrevExA(code_page, s, curr, 0);
6984 if (prev == curr)
6985 return 1;
6986 /* FIXME: This code is limited to "true" double-byte encodings,
6987 as it assumes an incomplete character consists of a single
6988 byte. */
6989 if (curr - prev == 2)
6990 return 1;
6991 if (!IsDBCSLeadByteEx(code_page, *prev))
6992 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993 return 0;
6994}
6995
Victor Stinner3a50e702011-10-18 21:21:00 +02006996static DWORD
6997decode_code_page_flags(UINT code_page)
6998{
6999 if (code_page == CP_UTF7) {
7000 /* The CP_UTF7 decoder only supports flags=0 */
7001 return 0;
7002 }
7003 else
7004 return MB_ERR_INVALID_CHARS;
7005}
7006
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007008 * Decode a byte string from a Windows code page into unicode object in strict
7009 * mode.
7010 *
7011 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7012 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007014static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007015decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007016 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 const char *in,
7018 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007019{
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007021 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007022 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023
7024 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 assert(insize > 0);
7026 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7027 if (outsize <= 0)
7028 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029
7030 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007032 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 if (*v == NULL)
7035 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 }
7038 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007041 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044 }
7045
7046 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7048 if (outsize <= 0)
7049 goto error;
7050 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007051
Victor Stinner3a50e702011-10-18 21:21:00 +02007052error:
7053 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7054 return -2;
7055 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007056 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057}
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059/*
7060 * Decode a byte string from a code page into unicode object with an error
7061 * handler.
7062 *
7063 * Returns consumed size if succeed, or raise a WindowsError or
7064 * UnicodeDecodeError exception and returns -1 on error.
7065 */
7066static int
7067decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007068 PyObject **v,
7069 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 const char *errors)
7071{
7072 const char *startin = in;
7073 const char *endin = in + size;
7074 const DWORD flags = decode_code_page_flags(code_page);
7075 /* Ideally, we should get reason from FormatMessage. This is the Windows
7076 2000 English version of the message. */
7077 const char *reason = "No mapping for the Unicode character exists "
7078 "in the target code page.";
7079 /* each step cannot decode more than 1 character, but a character can be
7080 represented as a surrogate pair */
7081 wchar_t buffer[2], *startout, *out;
7082 int insize, outsize;
7083 PyObject *errorHandler = NULL;
7084 PyObject *exc = NULL;
7085 PyObject *encoding_obj = NULL;
7086 char *encoding;
7087 DWORD err;
7088 int ret = -1;
7089
7090 assert(size > 0);
7091
7092 encoding = code_page_name(code_page, &encoding_obj);
7093 if (encoding == NULL)
7094 return -1;
7095
7096 if (errors == NULL || strcmp(errors, "strict") == 0) {
7097 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7098 UnicodeDecodeError. */
7099 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7100 if (exc != NULL) {
7101 PyCodec_StrictErrors(exc);
7102 Py_CLEAR(exc);
7103 }
7104 goto error;
7105 }
7106
7107 if (*v == NULL) {
7108 /* Create unicode object */
7109 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7110 PyErr_NoMemory();
7111 goto error;
7112 }
Victor Stinnerab595942011-12-17 04:59:06 +01007113 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 if (*v == NULL)
7116 goto error;
7117 startout = PyUnicode_AS_UNICODE(*v);
7118 }
7119 else {
7120 /* Extend unicode object */
7121 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7122 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7123 PyErr_NoMemory();
7124 goto error;
7125 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007126 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 goto error;
7128 startout = PyUnicode_AS_UNICODE(*v) + n;
7129 }
7130
7131 /* Decode the byte string character per character */
7132 out = startout;
7133 while (in < endin)
7134 {
7135 /* Decode a character */
7136 insize = 1;
7137 do
7138 {
7139 outsize = MultiByteToWideChar(code_page, flags,
7140 in, insize,
7141 buffer, Py_ARRAY_LENGTH(buffer));
7142 if (outsize > 0)
7143 break;
7144 err = GetLastError();
7145 if (err != ERROR_NO_UNICODE_TRANSLATION
7146 && err != ERROR_INSUFFICIENT_BUFFER)
7147 {
7148 PyErr_SetFromWindowsErr(0);
7149 goto error;
7150 }
7151 insize++;
7152 }
7153 /* 4=maximum length of a UTF-8 sequence */
7154 while (insize <= 4 && (in + insize) <= endin);
7155
7156 if (outsize <= 0) {
7157 Py_ssize_t startinpos, endinpos, outpos;
7158
7159 startinpos = in - startin;
7160 endinpos = startinpos + 1;
7161 outpos = out - PyUnicode_AS_UNICODE(*v);
7162 if (unicode_decode_call_errorhandler(
7163 errors, &errorHandler,
7164 encoding, reason,
7165 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007166 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 {
7168 goto error;
7169 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007170 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 }
7172 else {
7173 in += insize;
7174 memcpy(out, buffer, outsize * sizeof(wchar_t));
7175 out += outsize;
7176 }
7177 }
7178
7179 /* write a NUL character at the end */
7180 *out = 0;
7181
7182 /* Extend unicode object */
7183 outsize = out - startout;
7184 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007185 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007187 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188
7189error:
7190 Py_XDECREF(encoding_obj);
7191 Py_XDECREF(errorHandler);
7192 Py_XDECREF(exc);
7193 return ret;
7194}
7195
Victor Stinner3a50e702011-10-18 21:21:00 +02007196static PyObject *
7197decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007198 const char *s, Py_ssize_t size,
7199 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200{
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 PyObject *v = NULL;
7202 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 if (code_page < 0) {
7205 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7206 return NULL;
7207 }
7208
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007211
Victor Stinner76a31a62011-11-04 00:05:13 +01007212 do
7213 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 if (size > INT_MAX) {
7216 chunk_size = INT_MAX;
7217 final = 0;
7218 done = 0;
7219 }
7220 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007222 {
7223 chunk_size = (int)size;
7224 final = (consumed == NULL);
7225 done = 1;
7226 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227
Victor Stinner76a31a62011-11-04 00:05:13 +01007228 /* Skip trailing lead-byte unless 'final' is set */
7229 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7230 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007231
Victor Stinner76a31a62011-11-04 00:05:13 +01007232 if (chunk_size == 0 && done) {
7233 if (v != NULL)
7234 break;
7235 Py_INCREF(unicode_empty);
7236 return unicode_empty;
7237 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007238
Victor Stinner76a31a62011-11-04 00:05:13 +01007239
7240 converted = decode_code_page_strict(code_page, &v,
7241 s, chunk_size);
7242 if (converted == -2)
7243 converted = decode_code_page_errors(code_page, &v,
7244 s, chunk_size,
7245 errors);
7246 assert(converted != 0);
7247
7248 if (converted < 0) {
7249 Py_XDECREF(v);
7250 return NULL;
7251 }
7252
7253 if (consumed)
7254 *consumed += converted;
7255
7256 s += converted;
7257 size -= converted;
7258 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007259
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007260 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261}
7262
Alexander Belopolsky40018472011-02-26 01:02:56 +00007263PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007264PyUnicode_DecodeCodePageStateful(int code_page,
7265 const char *s,
7266 Py_ssize_t size,
7267 const char *errors,
7268 Py_ssize_t *consumed)
7269{
7270 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7271}
7272
7273PyObject *
7274PyUnicode_DecodeMBCSStateful(const char *s,
7275 Py_ssize_t size,
7276 const char *errors,
7277 Py_ssize_t *consumed)
7278{
7279 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7280}
7281
7282PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007283PyUnicode_DecodeMBCS(const char *s,
7284 Py_ssize_t size,
7285 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007286{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7288}
7289
Victor Stinner3a50e702011-10-18 21:21:00 +02007290static DWORD
7291encode_code_page_flags(UINT code_page, const char *errors)
7292{
7293 if (code_page == CP_UTF8) {
7294 if (winver.dwMajorVersion >= 6)
7295 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7296 and later */
7297 return WC_ERR_INVALID_CHARS;
7298 else
7299 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7300 return 0;
7301 }
7302 else if (code_page == CP_UTF7) {
7303 /* CP_UTF7 only supports flags=0 */
7304 return 0;
7305 }
7306 else {
7307 if (errors != NULL && strcmp(errors, "replace") == 0)
7308 return 0;
7309 else
7310 return WC_NO_BEST_FIT_CHARS;
7311 }
7312}
7313
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 * Encode a Unicode string to a Windows code page into a byte string in strict
7316 * mode.
7317 *
7318 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7319 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007321static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007322encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007323 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325{
Victor Stinner554f3f02010-06-16 23:33:54 +00007326 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 BOOL *pusedDefaultChar = &usedDefaultChar;
7328 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007329 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007330 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007331 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 const DWORD flags = encode_code_page_flags(code_page, NULL);
7333 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007334 /* Create a substring so that we can get the UTF-16 representation
7335 of just the slice under consideration. */
7336 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007339
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007341 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007343 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007344
Victor Stinner2fc507f2011-11-04 20:06:39 +01007345 substring = PyUnicode_Substring(unicode, offset, offset+len);
7346 if (substring == NULL)
7347 return -1;
7348 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7349 if (p == NULL) {
7350 Py_DECREF(substring);
7351 return -1;
7352 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007353
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007354 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 outsize = WideCharToMultiByte(code_page, flags,
7356 p, size,
7357 NULL, 0,
7358 NULL, pusedDefaultChar);
7359 if (outsize <= 0)
7360 goto error;
7361 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007362 if (pusedDefaultChar && *pusedDefaultChar) {
7363 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007366
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007370 if (*outbytes == NULL) {
7371 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007373 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375 }
7376 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 const Py_ssize_t n = PyBytes_Size(*outbytes);
7379 if (outsize > PY_SSIZE_T_MAX - n) {
7380 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007381 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007384 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7385 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007387 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389 }
7390
7391 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 outsize = WideCharToMultiByte(code_page, flags,
7393 p, size,
7394 out, outsize,
7395 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007396 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 if (outsize <= 0)
7398 goto error;
7399 if (pusedDefaultChar && *pusedDefaultChar)
7400 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007402
Victor Stinner3a50e702011-10-18 21:21:00 +02007403error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007404 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7406 return -2;
7407 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007408 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007409}
7410
Victor Stinner3a50e702011-10-18 21:21:00 +02007411/*
7412 * Encode a Unicode string to a Windows code page into a byte string using a
7413 * error handler.
7414 *
7415 * Returns consumed characters if succeed, or raise a WindowsError and returns
7416 * -1 on other error.
7417 */
7418static int
7419encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007420 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007422{
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007424 Py_ssize_t pos = unicode_offset;
7425 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 /* Ideally, we should get reason from FormatMessage. This is the Windows
7427 2000 English version of the message. */
7428 const char *reason = "invalid character";
7429 /* 4=maximum length of a UTF-8 sequence */
7430 char buffer[4];
7431 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7432 Py_ssize_t outsize;
7433 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 PyObject *errorHandler = NULL;
7435 PyObject *exc = NULL;
7436 PyObject *encoding_obj = NULL;
7437 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007438 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 PyObject *rep;
7440 int ret = -1;
7441
7442 assert(insize > 0);
7443
7444 encoding = code_page_name(code_page, &encoding_obj);
7445 if (encoding == NULL)
7446 return -1;
7447
7448 if (errors == NULL || strcmp(errors, "strict") == 0) {
7449 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7450 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007451 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 if (exc != NULL) {
7453 PyCodec_StrictErrors(exc);
7454 Py_DECREF(exc);
7455 }
7456 Py_XDECREF(encoding_obj);
7457 return -1;
7458 }
7459
7460 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7461 pusedDefaultChar = &usedDefaultChar;
7462 else
7463 pusedDefaultChar = NULL;
7464
7465 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7466 PyErr_NoMemory();
7467 goto error;
7468 }
7469 outsize = insize * Py_ARRAY_LENGTH(buffer);
7470
7471 if (*outbytes == NULL) {
7472 /* Create string object */
7473 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7474 if (*outbytes == NULL)
7475 goto error;
7476 out = PyBytes_AS_STRING(*outbytes);
7477 }
7478 else {
7479 /* Extend string object */
7480 Py_ssize_t n = PyBytes_Size(*outbytes);
7481 if (n > PY_SSIZE_T_MAX - outsize) {
7482 PyErr_NoMemory();
7483 goto error;
7484 }
7485 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7486 goto error;
7487 out = PyBytes_AS_STRING(*outbytes) + n;
7488 }
7489
7490 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7494 wchar_t chars[2];
7495 int charsize;
7496 if (ch < 0x10000) {
7497 chars[0] = (wchar_t)ch;
7498 charsize = 1;
7499 }
7500 else {
7501 ch -= 0x10000;
7502 chars[0] = 0xd800 + (ch >> 10);
7503 chars[1] = 0xdc00 + (ch & 0x3ff);
7504 charsize = 2;
7505 }
7506
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 buffer, Py_ARRAY_LENGTH(buffer),
7510 NULL, pusedDefaultChar);
7511 if (outsize > 0) {
7512 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7513 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 memcpy(out, buffer, outsize);
7516 out += outsize;
7517 continue;
7518 }
7519 }
7520 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7521 PyErr_SetFromWindowsErr(0);
7522 goto error;
7523 }
7524
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 rep = unicode_encode_call_errorhandler(
7526 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007527 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007528 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 if (rep == NULL)
7530 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532
7533 if (PyBytes_Check(rep)) {
7534 outsize = PyBytes_GET_SIZE(rep);
7535 if (outsize != 1) {
7536 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7537 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7538 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7539 Py_DECREF(rep);
7540 goto error;
7541 }
7542 out = PyBytes_AS_STRING(*outbytes) + offset;
7543 }
7544 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7545 out += outsize;
7546 }
7547 else {
7548 Py_ssize_t i;
7549 enum PyUnicode_Kind kind;
7550 void *data;
7551
Benjamin Petersonbac79492012-01-14 13:34:47 -05007552 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 Py_DECREF(rep);
7554 goto error;
7555 }
7556
7557 outsize = PyUnicode_GET_LENGTH(rep);
7558 if (outsize != 1) {
7559 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7560 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7561 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7562 Py_DECREF(rep);
7563 goto error;
7564 }
7565 out = PyBytes_AS_STRING(*outbytes) + offset;
7566 }
7567 kind = PyUnicode_KIND(rep);
7568 data = PyUnicode_DATA(rep);
7569 for (i=0; i < outsize; i++) {
7570 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7571 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007572 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007573 encoding, unicode,
7574 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 "unable to encode error handler result to ASCII");
7576 Py_DECREF(rep);
7577 goto error;
7578 }
7579 *out = (unsigned char)ch;
7580 out++;
7581 }
7582 }
7583 Py_DECREF(rep);
7584 }
7585 /* write a NUL byte */
7586 *out = 0;
7587 outsize = out - PyBytes_AS_STRING(*outbytes);
7588 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7589 if (_PyBytes_Resize(outbytes, outsize) < 0)
7590 goto error;
7591 ret = 0;
7592
7593error:
7594 Py_XDECREF(encoding_obj);
7595 Py_XDECREF(errorHandler);
7596 Py_XDECREF(exc);
7597 return ret;
7598}
7599
Victor Stinner3a50e702011-10-18 21:21:00 +02007600static PyObject *
7601encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007602 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 const char *errors)
7604{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007607 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007608 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007609
Benjamin Petersonbac79492012-01-14 13:34:47 -05007610 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007611 return NULL;
7612 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007613
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 if (code_page < 0) {
7615 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7616 return NULL;
7617 }
7618
Martin v. Löwis3d325192011-11-04 18:23:06 +01007619 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007620 return PyBytes_FromStringAndSize(NULL, 0);
7621
Victor Stinner7581cef2011-11-03 22:32:33 +01007622 offset = 0;
7623 do
7624 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007625#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 chunks. */
7628 if (len > INT_MAX/2) {
7629 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007630 done = 0;
7631 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007632 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007633#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007634 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007636 done = 1;
7637 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007638
Victor Stinner76a31a62011-11-04 00:05:13 +01007639 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007640 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007641 errors);
7642 if (ret == -2)
7643 ret = encode_code_page_errors(code_page, &outbytes,
7644 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007645 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007646 if (ret < 0) {
7647 Py_XDECREF(outbytes);
7648 return NULL;
7649 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007650
Victor Stinner7581cef2011-11-03 22:32:33 +01007651 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007653 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654
Victor Stinner3a50e702011-10-18 21:21:00 +02007655 return outbytes;
7656}
7657
7658PyObject *
7659PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7660 Py_ssize_t size,
7661 const char *errors)
7662{
Victor Stinner7581cef2011-11-03 22:32:33 +01007663 PyObject *unicode, *res;
7664 unicode = PyUnicode_FromUnicode(p, size);
7665 if (unicode == NULL)
7666 return NULL;
7667 res = encode_code_page(CP_ACP, unicode, errors);
7668 Py_DECREF(unicode);
7669 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007670}
7671
7672PyObject *
7673PyUnicode_EncodeCodePage(int code_page,
7674 PyObject *unicode,
7675 const char *errors)
7676{
Victor Stinner7581cef2011-11-03 22:32:33 +01007677 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007678}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007679
Alexander Belopolsky40018472011-02-26 01:02:56 +00007680PyObject *
7681PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007682{
7683 if (!PyUnicode_Check(unicode)) {
7684 PyErr_BadArgument();
7685 return NULL;
7686 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007687 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007688}
7689
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007690#undef NEED_RETRY
7691
Victor Stinner99b95382011-07-04 14:23:54 +02007692#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007693
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694/* --- Character Mapping Codec -------------------------------------------- */
7695
Alexander Belopolsky40018472011-02-26 01:02:56 +00007696PyObject *
7697PyUnicode_DecodeCharmap(const char *s,
7698 Py_ssize_t size,
7699 PyObject *mapping,
7700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007703 Py_ssize_t startinpos;
7704 Py_ssize_t endinpos;
7705 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007707 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007708 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 PyObject *errorHandler = NULL;
7710 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007711
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 /* Default to Latin-1 */
7713 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007716 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007720 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007721 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007723 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007724 Py_ssize_t maplen;
7725 enum PyUnicode_Kind kind;
7726 void *data;
7727 Py_UCS4 x;
7728
Benjamin Petersonbac79492012-01-14 13:34:47 -05007729 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007730 return NULL;
7731
7732 maplen = PyUnicode_GET_LENGTH(mapping);
7733 data = PyUnicode_DATA(mapping);
7734 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 while (s < e) {
7736 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007739 x = PyUnicode_READ(kind, data, ch);
7740 else
7741 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007743 if (x == 0xfffe)
7744 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 startinpos = s-starts;
7747 endinpos = startinpos+1;
7748 if (unicode_decode_call_errorhandler(
7749 errors, &errorHandler,
7750 "charmap", "character maps to <undefined>",
7751 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007752 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 goto onError;
7754 }
7755 continue;
7756 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007757
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007758 if (unicode_putchar(&v, &outpos, x) < 0)
7759 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007761 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007762 }
7763 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 while (s < e) {
7765 unsigned char ch = *s;
7766 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007767
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7769 w = PyLong_FromLong((long)ch);
7770 if (w == NULL)
7771 goto onError;
7772 x = PyObject_GetItem(mapping, w);
7773 Py_DECREF(w);
7774 if (x == NULL) {
7775 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7776 /* No mapping found means: mapping is undefined. */
7777 PyErr_Clear();
7778 x = Py_None;
7779 Py_INCREF(x);
7780 } else
7781 goto onError;
7782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007783
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 /* Apply mapping */
7785 if (PyLong_Check(x)) {
7786 long value = PyLong_AS_LONG(x);
7787 if (value < 0 || value > 65535) {
7788 PyErr_SetString(PyExc_TypeError,
7789 "character mapping must be in range(65536)");
7790 Py_DECREF(x);
7791 goto onError;
7792 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007793 if (unicode_putchar(&v, &outpos, value) < 0)
7794 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 }
7796 else if (x == Py_None) {
7797 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 startinpos = s-starts;
7799 endinpos = startinpos+1;
7800 if (unicode_decode_call_errorhandler(
7801 errors, &errorHandler,
7802 "charmap", "character maps to <undefined>",
7803 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007804 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 Py_DECREF(x);
7806 goto onError;
7807 }
7808 Py_DECREF(x);
7809 continue;
7810 }
7811 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007812 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007813
Benjamin Petersonbac79492012-01-14 13:34:47 -05007814 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007815 goto onError;
7816 targetsize = PyUnicode_GET_LENGTH(x);
7817
7818 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007820 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007821 PyUnicode_READ_CHAR(x, 0)) < 0)
7822 goto onError;
7823 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 else if (targetsize > 1) {
7825 /* 1-n mapping */
7826 if (targetsize > extrachars) {
7827 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 Py_ssize_t needed = (targetsize - extrachars) + \
7829 (targetsize << 2);
7830 extrachars += needed;
7831 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007832 if (unicode_resize(&v,
7833 PyUnicode_GET_LENGTH(v) + needed) < 0)
7834 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 Py_DECREF(x);
7836 goto onError;
7837 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007839 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7840 goto onError;
7841 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7842 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 extrachars -= targetsize;
7844 }
7845 /* 1-0 mapping: skip the character */
7846 }
7847 else {
7848 /* wrong return value */
7849 PyErr_SetString(PyExc_TypeError,
7850 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007851 Py_DECREF(x);
7852 goto onError;
7853 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 Py_DECREF(x);
7855 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007858 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007859 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007860 Py_XDECREF(errorHandler);
7861 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007862 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007863
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865 Py_XDECREF(errorHandler);
7866 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 Py_XDECREF(v);
7868 return NULL;
7869}
7870
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871/* Charmap encoding: the lookup table */
7872
Alexander Belopolsky40018472011-02-26 01:02:56 +00007873struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 PyObject_HEAD
7875 unsigned char level1[32];
7876 int count2, count3;
7877 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878};
7879
7880static PyObject*
7881encoding_map_size(PyObject *obj, PyObject* args)
7882{
7883 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007884 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886}
7887
7888static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 PyDoc_STR("Return the size (in bytes) of this object") },
7891 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007892};
7893
7894static void
7895encoding_map_dealloc(PyObject* o)
7896{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007897 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007898}
7899
7900static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007901 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 "EncodingMap", /*tp_name*/
7903 sizeof(struct encoding_map), /*tp_basicsize*/
7904 0, /*tp_itemsize*/
7905 /* methods */
7906 encoding_map_dealloc, /*tp_dealloc*/
7907 0, /*tp_print*/
7908 0, /*tp_getattr*/
7909 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007910 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 0, /*tp_repr*/
7912 0, /*tp_as_number*/
7913 0, /*tp_as_sequence*/
7914 0, /*tp_as_mapping*/
7915 0, /*tp_hash*/
7916 0, /*tp_call*/
7917 0, /*tp_str*/
7918 0, /*tp_getattro*/
7919 0, /*tp_setattro*/
7920 0, /*tp_as_buffer*/
7921 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7922 0, /*tp_doc*/
7923 0, /*tp_traverse*/
7924 0, /*tp_clear*/
7925 0, /*tp_richcompare*/
7926 0, /*tp_weaklistoffset*/
7927 0, /*tp_iter*/
7928 0, /*tp_iternext*/
7929 encoding_map_methods, /*tp_methods*/
7930 0, /*tp_members*/
7931 0, /*tp_getset*/
7932 0, /*tp_base*/
7933 0, /*tp_dict*/
7934 0, /*tp_descr_get*/
7935 0, /*tp_descr_set*/
7936 0, /*tp_dictoffset*/
7937 0, /*tp_init*/
7938 0, /*tp_alloc*/
7939 0, /*tp_new*/
7940 0, /*tp_free*/
7941 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942};
7943
7944PyObject*
7945PyUnicode_BuildEncodingMap(PyObject* string)
7946{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 PyObject *result;
7948 struct encoding_map *mresult;
7949 int i;
7950 int need_dict = 0;
7951 unsigned char level1[32];
7952 unsigned char level2[512];
7953 unsigned char *mlevel1, *mlevel2, *mlevel3;
7954 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007955 int kind;
7956 void *data;
7957 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007959 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960 PyErr_BadArgument();
7961 return NULL;
7962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 kind = PyUnicode_KIND(string);
7964 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965 memset(level1, 0xFF, sizeof level1);
7966 memset(level2, 0xFF, sizeof level2);
7967
7968 /* If there isn't a one-to-one mapping of NULL to \0,
7969 or if there are non-BMP characters, we need to use
7970 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007972 need_dict = 1;
7973 for (i = 1; i < 256; i++) {
7974 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007975 ch = PyUnicode_READ(kind, data, i);
7976 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977 need_dict = 1;
7978 break;
7979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007980 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007981 /* unmapped character */
7982 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 l1 = ch >> 11;
7984 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985 if (level1[l1] == 0xFF)
7986 level1[l1] = count2++;
7987 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007988 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007989 }
7990
7991 if (count2 >= 0xFF || count3 >= 0xFF)
7992 need_dict = 1;
7993
7994 if (need_dict) {
7995 PyObject *result = PyDict_New();
7996 PyObject *key, *value;
7997 if (!result)
7998 return NULL;
7999 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008000 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008001 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002 if (!key || !value)
8003 goto failed1;
8004 if (PyDict_SetItem(result, key, value) == -1)
8005 goto failed1;
8006 Py_DECREF(key);
8007 Py_DECREF(value);
8008 }
8009 return result;
8010 failed1:
8011 Py_XDECREF(key);
8012 Py_XDECREF(value);
8013 Py_DECREF(result);
8014 return NULL;
8015 }
8016
8017 /* Create a three-level trie */
8018 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8019 16*count2 + 128*count3 - 1);
8020 if (!result)
8021 return PyErr_NoMemory();
8022 PyObject_Init(result, &EncodingMapType);
8023 mresult = (struct encoding_map*)result;
8024 mresult->count2 = count2;
8025 mresult->count3 = count3;
8026 mlevel1 = mresult->level1;
8027 mlevel2 = mresult->level23;
8028 mlevel3 = mresult->level23 + 16*count2;
8029 memcpy(mlevel1, level1, 32);
8030 memset(mlevel2, 0xFF, 16*count2);
8031 memset(mlevel3, 0, 128*count3);
8032 count3 = 0;
8033 for (i = 1; i < 256; i++) {
8034 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 /* unmapped character */
8037 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 o1 = PyUnicode_READ(kind, data, i)>>11;
8039 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 i2 = 16*mlevel1[o1] + o2;
8041 if (mlevel2[i2] == 0xFF)
8042 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044 i3 = 128*mlevel2[i2] + o3;
8045 mlevel3[i3] = i;
8046 }
8047 return result;
8048}
8049
8050static int
Victor Stinner22168992011-11-20 17:09:18 +01008051encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052{
8053 struct encoding_map *map = (struct encoding_map*)mapping;
8054 int l1 = c>>11;
8055 int l2 = (c>>7) & 0xF;
8056 int l3 = c & 0x7F;
8057 int i;
8058
Victor Stinner22168992011-11-20 17:09:18 +01008059 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 if (c == 0)
8062 return 0;
8063 /* level 1*/
8064 i = map->level1[l1];
8065 if (i == 0xFF) {
8066 return -1;
8067 }
8068 /* level 2*/
8069 i = map->level23[16*i+l2];
8070 if (i == 0xFF) {
8071 return -1;
8072 }
8073 /* level 3 */
8074 i = map->level23[16*map->count2 + 128*i + l3];
8075 if (i == 0) {
8076 return -1;
8077 }
8078 return i;
8079}
8080
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081/* Lookup the character ch in the mapping. If the character
8082 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008083 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008084static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008085charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
Christian Heimes217cfd12007-12-02 14:31:20 +00008087 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 PyObject *x;
8089
8090 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 x = PyObject_GetItem(mapping, w);
8093 Py_DECREF(w);
8094 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8096 /* No mapping found means: mapping is undefined. */
8097 PyErr_Clear();
8098 x = Py_None;
8099 Py_INCREF(x);
8100 return x;
8101 } else
8102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008104 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008106 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 long value = PyLong_AS_LONG(x);
8108 if (value < 0 || value > 255) {
8109 PyErr_SetString(PyExc_TypeError,
8110 "character mapping must be in range(256)");
8111 Py_DECREF(x);
8112 return NULL;
8113 }
8114 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008116 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 /* wrong return value */
8120 PyErr_Format(PyExc_TypeError,
8121 "character mapping must return integer, bytes or None, not %.400s",
8122 x->ob_type->tp_name);
8123 Py_DECREF(x);
8124 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 }
8126}
8127
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008128static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008129charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008130{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8132 /* exponentially overallocate to minimize reallocations */
8133 if (requiredsize < 2*outsize)
8134 requiredsize = 2*outsize;
8135 if (_PyBytes_Resize(outobj, requiredsize))
8136 return -1;
8137 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138}
8139
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008144 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 space is available. Return a new reference to the object that
8146 was put in the output buffer, or Py_None, if the mapping was undefined
8147 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008148 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008149static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008150charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008151 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 PyObject *rep;
8154 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008155 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156
Christian Heimes90aa7642007-12-19 02:45:37 +00008157 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 if (res == -1)
8161 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 if (outsize<requiredsize)
8163 if (charmapencode_resize(outobj, outpos, requiredsize))
8164 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008165 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 outstart[(*outpos)++] = (char)res;
8167 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008168 }
8169
8170 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008171 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 Py_DECREF(rep);
8175 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 if (PyLong_Check(rep)) {
8178 Py_ssize_t requiredsize = *outpos+1;
8179 if (outsize<requiredsize)
8180 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8181 Py_DECREF(rep);
8182 return enc_EXCEPTION;
8183 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008184 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 else {
8188 const char *repchars = PyBytes_AS_STRING(rep);
8189 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8190 Py_ssize_t requiredsize = *outpos+repsize;
8191 if (outsize<requiredsize)
8192 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8193 Py_DECREF(rep);
8194 return enc_EXCEPTION;
8195 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008196 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 memcpy(outstart + *outpos, repchars, repsize);
8198 *outpos += repsize;
8199 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201 Py_DECREF(rep);
8202 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203}
8204
8205/* handle an error in PyUnicode_EncodeCharmap
8206 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008207static int
8208charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008209 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008211 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008212 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213{
8214 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008215 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008216 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008217 enum PyUnicode_Kind kind;
8218 void *data;
8219 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008221 Py_ssize_t collstartpos = *inpos;
8222 Py_ssize_t collendpos = *inpos+1;
8223 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 char *encoding = "charmap";
8225 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008227 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008228 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229
Benjamin Petersonbac79492012-01-14 13:34:47 -05008230 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008231 return -1;
8232 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008233 /* find all unencodable characters */
8234 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008235 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008236 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008237 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008238 val = encoding_map_lookup(ch, mapping);
8239 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 break;
8241 ++collendpos;
8242 continue;
8243 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008244
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008245 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8246 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 if (rep==NULL)
8248 return -1;
8249 else if (rep!=Py_None) {
8250 Py_DECREF(rep);
8251 break;
8252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008253 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255 }
8256 /* cache callback name lookup
8257 * (if not done yet, i.e. it's the first error) */
8258 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 if ((errors==NULL) || (!strcmp(errors, "strict")))
8260 *known_errorHandler = 1;
8261 else if (!strcmp(errors, "replace"))
8262 *known_errorHandler = 2;
8263 else if (!strcmp(errors, "ignore"))
8264 *known_errorHandler = 3;
8265 else if (!strcmp(errors, "xmlcharrefreplace"))
8266 *known_errorHandler = 4;
8267 else
8268 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 }
8270 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008272 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008273 return -1;
8274 case 2: /* replace */
8275 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 x = charmapencode_output('?', mapping, res, respos);
8277 if (x==enc_EXCEPTION) {
8278 return -1;
8279 }
8280 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008281 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 return -1;
8283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008284 }
8285 /* fall through */
8286 case 3: /* ignore */
8287 *inpos = collendpos;
8288 break;
8289 case 4: /* xmlcharrefreplace */
8290 /* generate replacement (temporarily (mis)uses p) */
8291 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 char buffer[2+29+1+1];
8293 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 for (cp = buffer; *cp; ++cp) {
8296 x = charmapencode_output(*cp, mapping, res, respos);
8297 if (x==enc_EXCEPTION)
8298 return -1;
8299 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008300 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return -1;
8302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008303 }
8304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008305 *inpos = collendpos;
8306 break;
8307 default:
8308 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008309 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008311 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008313 if (PyBytes_Check(repunicode)) {
8314 /* Directly copy bytes result to output. */
8315 Py_ssize_t outsize = PyBytes_Size(*res);
8316 Py_ssize_t requiredsize;
8317 repsize = PyBytes_Size(repunicode);
8318 requiredsize = *respos + repsize;
8319 if (requiredsize > outsize)
8320 /* Make room for all additional bytes. */
8321 if (charmapencode_resize(res, respos, requiredsize)) {
8322 Py_DECREF(repunicode);
8323 return -1;
8324 }
8325 memcpy(PyBytes_AsString(*res) + *respos,
8326 PyBytes_AsString(repunicode), repsize);
8327 *respos += repsize;
8328 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008329 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008330 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008332 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008333 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008334 Py_DECREF(repunicode);
8335 return -1;
8336 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008337 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008338 data = PyUnicode_DATA(repunicode);
8339 kind = PyUnicode_KIND(repunicode);
8340 for (index = 0; index < repsize; index++) {
8341 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8342 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008344 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 return -1;
8346 }
8347 else if (x==enc_FAILED) {
8348 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008349 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 return -1;
8351 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008352 }
8353 *inpos = newpos;
8354 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 }
8356 return 0;
8357}
8358
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008360_PyUnicode_EncodeCharmap(PyObject *unicode,
8361 PyObject *mapping,
8362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 /* output object */
8365 PyObject *res = NULL;
8366 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008367 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 PyObject *errorHandler = NULL;
8372 PyObject *exc = NULL;
8373 /* the following variable is used for caching string comparisons
8374 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8375 * 3=ignore, 4=xmlcharrefreplace */
8376 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377
Benjamin Petersonbac79492012-01-14 13:34:47 -05008378 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008379 return NULL;
8380 size = PyUnicode_GET_LENGTH(unicode);
8381
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 /* Default to Latin-1 */
8383 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 /* allocate enough for a simple encoding without
8387 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008388 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 if (res == NULL)
8390 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008391 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008395 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008397 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 if (x==enc_EXCEPTION) /* error */
8399 goto onError;
8400 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008401 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 &exc,
8403 &known_errorHandler, &errorHandler, errors,
8404 &res, &respos)) {
8405 goto onError;
8406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 else
8409 /* done with this character => adjust input position */
8410 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008414 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008415 if (_PyBytes_Resize(&res, respos) < 0)
8416 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 Py_XDECREF(exc);
8419 Py_XDECREF(errorHandler);
8420 return res;
8421
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 Py_XDECREF(res);
8424 Py_XDECREF(exc);
8425 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 return NULL;
8427}
8428
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008429/* Deprecated */
8430PyObject *
8431PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8432 Py_ssize_t size,
8433 PyObject *mapping,
8434 const char *errors)
8435{
8436 PyObject *result;
8437 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8438 if (unicode == NULL)
8439 return NULL;
8440 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8441 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008442 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008443}
8444
Alexander Belopolsky40018472011-02-26 01:02:56 +00008445PyObject *
8446PyUnicode_AsCharmapString(PyObject *unicode,
8447 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448{
8449 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 PyErr_BadArgument();
8451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008453 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454}
8455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008457static void
8458make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008460 Py_ssize_t startpos, Py_ssize_t endpos,
8461 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 *exceptionObject = _PyUnicodeTranslateError_Create(
8465 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 }
8467 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8469 goto onError;
8470 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8471 goto onError;
8472 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8473 goto onError;
8474 return;
8475 onError:
8476 Py_DECREF(*exceptionObject);
8477 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 }
8479}
8480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008482static void
8483raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485 Py_ssize_t startpos, Py_ssize_t endpos,
8486 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487{
8488 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492}
8493
8494/* error handling callback helper:
8495 build arguments, call the callback and check the arguments,
8496 put the result into newpos and return the replacement string, which
8497 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008498static PyObject *
8499unicode_translate_call_errorhandler(const char *errors,
8500 PyObject **errorHandler,
8501 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008503 Py_ssize_t startpos, Py_ssize_t endpos,
8504 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008506 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008508 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 PyObject *restuple;
8510 PyObject *resunicode;
8511
8512 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 }
8517
8518 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008522
8523 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008528 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 Py_DECREF(restuple);
8530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
8532 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 &resunicode, &i_newpos)) {
8534 Py_DECREF(restuple);
8535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008537 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 else
8540 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8543 Py_DECREF(restuple);
8544 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008545 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 Py_INCREF(resunicode);
8547 Py_DECREF(restuple);
8548 return resunicode;
8549}
8550
8551/* Lookup the character ch in the mapping and put the result in result,
8552 which must be decrefed by the caller.
8553 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008554static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556{
Christian Heimes217cfd12007-12-02 14:31:20 +00008557 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 PyObject *x;
8559
8560 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 x = PyObject_GetItem(mapping, w);
8563 Py_DECREF(w);
8564 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8566 /* No mapping found means: use 1:1 mapping. */
8567 PyErr_Clear();
8568 *result = NULL;
8569 return 0;
8570 } else
8571 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 }
8573 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 *result = x;
8575 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008577 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 long value = PyLong_AS_LONG(x);
8579 long max = PyUnicode_GetMax();
8580 if (value < 0 || value > max) {
8581 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008582 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 Py_DECREF(x);
8584 return -1;
8585 }
8586 *result = x;
8587 return 0;
8588 }
8589 else if (PyUnicode_Check(x)) {
8590 *result = x;
8591 return 0;
8592 }
8593 else {
8594 /* wrong return value */
8595 PyErr_SetString(PyExc_TypeError,
8596 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008597 Py_DECREF(x);
8598 return -1;
8599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600}
8601/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 if not reallocate and adjust various state variables.
8603 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008609 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 /* exponentially overallocate to minimize reallocations */
8611 if (requiredsize < 2 * oldsize)
8612 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8614 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 }
8618 return 0;
8619}
8620/* lookup the character, put the result in the output string and adjust
8621 various state variables. Return a new reference to the object that
8622 was put in the output buffer in *result, or Py_None, if the mapping was
8623 undefined (in which case no character was written).
8624 The called must decref result.
8625 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8628 PyObject *mapping, Py_UCS4 **output,
8629 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8633 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 }
8639 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008641 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 }
8645 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 Py_ssize_t repsize;
8647 if (PyUnicode_READY(*res) == -1)
8648 return -1;
8649 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 if (repsize==1) {
8651 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 }
8654 else if (repsize!=0) {
8655 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 Py_ssize_t requiredsize = *opos +
8657 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 Py_ssize_t i;
8660 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 for(i = 0; i < repsize; i++)
8663 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 }
8666 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 return 0;
8669}
8670
Alexander Belopolsky40018472011-02-26 01:02:56 +00008671PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672_PyUnicode_TranslateCharmap(PyObject *input,
8673 PyObject *mapping,
8674 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 /* input object */
8677 char *idata;
8678 Py_ssize_t size, i;
8679 int kind;
8680 /* output buffer */
8681 Py_UCS4 *output = NULL;
8682 Py_ssize_t osize;
8683 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 char *reason = "character maps to <undefined>";
8687 PyObject *errorHandler = NULL;
8688 PyObject *exc = NULL;
8689 /* the following variable is used for caching string comparisons
8690 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8691 * 3=ignore, 4=xmlcharrefreplace */
8692 int known_errorHandler = -1;
8693
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 PyErr_BadArgument();
8696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 if (PyUnicode_READY(input) == -1)
8700 return NULL;
8701 idata = (char*)PyUnicode_DATA(input);
8702 kind = PyUnicode_KIND(input);
8703 size = PyUnicode_GET_LENGTH(input);
8704 i = 0;
8705
8706 if (size == 0) {
8707 Py_INCREF(input);
8708 return input;
8709 }
8710
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 /* allocate enough for a simple 1:1 translation without
8712 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 osize = size;
8714 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8715 opos = 0;
8716 if (output == NULL) {
8717 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 /* try to encode it */
8723 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 if (charmaptranslate_output(input, i, mapping,
8725 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 Py_XDECREF(x);
8727 goto onError;
8728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008729 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 else { /* untranslatable character */
8733 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8734 Py_ssize_t repsize;
8735 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 Py_ssize_t collstart = i;
8739 Py_ssize_t collend = i+1;
8740 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 while (collend < size) {
8744 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 goto onError;
8746 Py_XDECREF(x);
8747 if (x!=Py_None)
8748 break;
8749 ++collend;
8750 }
8751 /* cache callback name lookup
8752 * (if not done yet, i.e. it's the first error) */
8753 if (known_errorHandler==-1) {
8754 if ((errors==NULL) || (!strcmp(errors, "strict")))
8755 known_errorHandler = 1;
8756 else if (!strcmp(errors, "replace"))
8757 known_errorHandler = 2;
8758 else if (!strcmp(errors, "ignore"))
8759 known_errorHandler = 3;
8760 else if (!strcmp(errors, "xmlcharrefreplace"))
8761 known_errorHandler = 4;
8762 else
8763 known_errorHandler = 0;
8764 }
8765 switch (known_errorHandler) {
8766 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 raise_translate_exception(&exc, input, collstart,
8768 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008769 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 case 2: /* replace */
8771 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 for (coll = collstart; coll<collend; coll++)
8773 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 /* fall through */
8775 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 break;
8778 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 /* generate replacement (temporarily (mis)uses i) */
8780 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 char buffer[2+29+1+1];
8782 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8784 if (charmaptranslate_makespace(&output, &osize,
8785 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 goto onError;
8787 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 break;
8792 default:
8793 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 reason, input, &exc,
8795 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008796 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008798 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008799 Py_DECREF(repunicode);
8800 goto onError;
8801 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 repsize = PyUnicode_GET_LENGTH(repunicode);
8804 if (charmaptranslate_makespace(&output, &osize,
8805 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 Py_DECREF(repunicode);
8807 goto onError;
8808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 for (uni2 = 0; repsize-->0; ++uni2)
8810 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8811 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814 }
8815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8817 if (!res)
8818 goto onError;
8819 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 Py_XDECREF(exc);
8821 Py_XDECREF(errorHandler);
8822 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826 Py_XDECREF(exc);
8827 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 return NULL;
8829}
8830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831/* Deprecated. Use PyUnicode_Translate instead. */
8832PyObject *
8833PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8834 Py_ssize_t size,
8835 PyObject *mapping,
8836 const char *errors)
8837{
8838 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8839 if (!unicode)
8840 return NULL;
8841 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8842}
8843
Alexander Belopolsky40018472011-02-26 01:02:56 +00008844PyObject *
8845PyUnicode_Translate(PyObject *str,
8846 PyObject *mapping,
8847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848{
8849 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008850
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 str = PyUnicode_FromObject(str);
8852 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 Py_DECREF(str);
8856 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008857
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 Py_XDECREF(str);
8860 return NULL;
8861}
Tim Petersced69f82003-09-16 20:30:58 +00008862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008864fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865{
8866 /* No need to call PyUnicode_READY(self) because this function is only
8867 called as a callback from fixup() which does it already. */
8868 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8869 const int kind = PyUnicode_KIND(self);
8870 void *data = PyUnicode_DATA(self);
8871 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008872 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 Py_ssize_t i;
8874
8875 for (i = 0; i < len; ++i) {
8876 ch = PyUnicode_READ(kind, data, i);
8877 fixed = 0;
8878 if (ch > 127) {
8879 if (Py_UNICODE_ISSPACE(ch))
8880 fixed = ' ';
8881 else {
8882 const int decimal = Py_UNICODE_TODECIMAL(ch);
8883 if (decimal >= 0)
8884 fixed = '0' + decimal;
8885 }
8886 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008887 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 if (fixed > maxchar)
8889 maxchar = fixed;
8890 PyUnicode_WRITE(kind, data, i, fixed);
8891 }
8892 else if (ch > maxchar)
8893 maxchar = ch;
8894 }
8895 else if (ch > maxchar)
8896 maxchar = ch;
8897 }
8898
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008899 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900}
8901
8902PyObject *
8903_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8904{
8905 if (!PyUnicode_Check(unicode)) {
8906 PyErr_BadInternalCall();
8907 return NULL;
8908 }
8909 if (PyUnicode_READY(unicode) == -1)
8910 return NULL;
8911 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8912 /* If the string is already ASCII, just return the same string */
8913 Py_INCREF(unicode);
8914 return unicode;
8915 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008916 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917}
8918
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008919PyObject *
8920PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8921 Py_ssize_t length)
8922{
Victor Stinnerf0124502011-11-21 23:12:56 +01008923 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008924 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008925 Py_UCS4 maxchar;
8926 enum PyUnicode_Kind kind;
8927 void *data;
8928
8929 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008930 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008931 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008932 if (ch > 127) {
8933 int decimal = Py_UNICODE_TODECIMAL(ch);
8934 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008935 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008936 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008937 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008938 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008939
8940 /* Copy to a new string */
8941 decimal = PyUnicode_New(length, maxchar);
8942 if (decimal == NULL)
8943 return decimal;
8944 kind = PyUnicode_KIND(decimal);
8945 data = PyUnicode_DATA(decimal);
8946 /* Iterate over code points */
8947 for (i = 0; i < length; i++) {
8948 Py_UNICODE ch = s[i];
8949 if (ch > 127) {
8950 int decimal = Py_UNICODE_TODECIMAL(ch);
8951 if (decimal >= 0)
8952 ch = '0' + decimal;
8953 }
8954 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008956 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008957}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008958/* --- Decimal Encoder ---------------------------------------------------- */
8959
Alexander Belopolsky40018472011-02-26 01:02:56 +00008960int
8961PyUnicode_EncodeDecimal(Py_UNICODE *s,
8962 Py_ssize_t length,
8963 char *output,
8964 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008965{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008966 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008967 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008968 enum PyUnicode_Kind kind;
8969 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008970
8971 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 PyErr_BadArgument();
8973 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008974 }
8975
Victor Stinner42bf7752011-11-21 22:52:58 +01008976 unicode = PyUnicode_FromUnicode(s, length);
8977 if (unicode == NULL)
8978 return -1;
8979
Benjamin Petersonbac79492012-01-14 13:34:47 -05008980 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008981 Py_DECREF(unicode);
8982 return -1;
8983 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008984 kind = PyUnicode_KIND(unicode);
8985 data = PyUnicode_DATA(unicode);
8986
Victor Stinnerb84d7232011-11-22 01:50:07 +01008987 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008988 PyObject *exc;
8989 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008991 Py_ssize_t startpos;
8992
8993 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008994
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008996 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008997 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 decimal = Py_UNICODE_TODECIMAL(ch);
9001 if (decimal >= 0) {
9002 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009003 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 continue;
9005 }
9006 if (0 < ch && ch < 256) {
9007 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009008 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 continue;
9010 }
Victor Stinner6345be92011-11-25 20:09:01 +01009011
Victor Stinner42bf7752011-11-21 22:52:58 +01009012 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009013 exc = NULL;
9014 raise_encode_exception(&exc, "decimal", unicode,
9015 startpos, startpos+1,
9016 "invalid decimal Unicode string");
9017 Py_XDECREF(exc);
9018 Py_DECREF(unicode);
9019 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009020 }
9021 /* 0-terminate the output string */
9022 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009023 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009024 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009025}
9026
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027/* --- Helpers ------------------------------------------------------------ */
9028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009030any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 Py_ssize_t start,
9032 Py_ssize_t end)
9033{
9034 int kind1, kind2, kind;
9035 void *buf1, *buf2;
9036 Py_ssize_t len1, len2, result;
9037
9038 kind1 = PyUnicode_KIND(s1);
9039 kind2 = PyUnicode_KIND(s2);
9040 kind = kind1 > kind2 ? kind1 : kind2;
9041 buf1 = PyUnicode_DATA(s1);
9042 buf2 = PyUnicode_DATA(s2);
9043 if (kind1 != kind)
9044 buf1 = _PyUnicode_AsKind(s1, kind);
9045 if (!buf1)
9046 return -2;
9047 if (kind2 != kind)
9048 buf2 = _PyUnicode_AsKind(s2, kind);
9049 if (!buf2) {
9050 if (kind1 != kind) PyMem_Free(buf1);
9051 return -2;
9052 }
9053 len1 = PyUnicode_GET_LENGTH(s1);
9054 len2 = PyUnicode_GET_LENGTH(s2);
9055
Victor Stinner794d5672011-10-10 03:21:36 +02009056 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009057 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009058 case PyUnicode_1BYTE_KIND:
9059 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9060 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9061 else
9062 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9063 break;
9064 case PyUnicode_2BYTE_KIND:
9065 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9066 break;
9067 case PyUnicode_4BYTE_KIND:
9068 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9069 break;
9070 default:
9071 assert(0); result = -2;
9072 }
9073 }
9074 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009075 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009076 case PyUnicode_1BYTE_KIND:
9077 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9078 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9079 else
9080 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9081 break;
9082 case PyUnicode_2BYTE_KIND:
9083 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9084 break;
9085 case PyUnicode_4BYTE_KIND:
9086 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9087 break;
9088 default:
9089 assert(0); result = -2;
9090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 }
9092
9093 if (kind1 != kind)
9094 PyMem_Free(buf1);
9095 if (kind2 != kind)
9096 PyMem_Free(buf2);
9097
9098 return result;
9099}
9100
9101Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009102_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 Py_ssize_t n_buffer,
9104 void *digits, Py_ssize_t n_digits,
9105 Py_ssize_t min_width,
9106 const char *grouping,
9107 const char *thousands_sep)
9108{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009109 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009111 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9112 return _PyUnicode_ascii_InsertThousandsGrouping(
9113 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9114 min_width, grouping, thousands_sep);
9115 else
9116 return _PyUnicode_ucs1_InsertThousandsGrouping(
9117 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9118 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 case PyUnicode_2BYTE_KIND:
9120 return _PyUnicode_ucs2_InsertThousandsGrouping(
9121 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9122 min_width, grouping, thousands_sep);
9123 case PyUnicode_4BYTE_KIND:
9124 return _PyUnicode_ucs4_InsertThousandsGrouping(
9125 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9126 min_width, grouping, thousands_sep);
9127 }
9128 assert(0);
9129 return -1;
9130}
9131
9132
/* Helper macro to fix up start/end slice values against a length len:
   end is clipped to len, negative values are taken relative to len and
   then clipped to 0.  start and end must be modifiable lvalues; len is
   evaluated more than once, so it must not have side effects.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and can be used safely in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009147
Alexander Belopolsky40018472011-02-26 01:02:56 +00009148Py_ssize_t
9149PyUnicode_Count(PyObject *str,
9150 PyObject *substr,
9151 Py_ssize_t start,
9152 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009154 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009155 PyObject* str_obj;
9156 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 int kind1, kind2, kind;
9158 void *buf1 = NULL, *buf2 = NULL;
9159 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009160
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009161 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009162 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009164 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009165 if (!sub_obj) {
9166 Py_DECREF(str_obj);
9167 return -1;
9168 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009169 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009170 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 Py_DECREF(str_obj);
9172 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 }
Tim Petersced69f82003-09-16 20:30:58 +00009174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 kind1 = PyUnicode_KIND(str_obj);
9176 kind2 = PyUnicode_KIND(sub_obj);
9177 kind = kind1 > kind2 ? kind1 : kind2;
9178 buf1 = PyUnicode_DATA(str_obj);
9179 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009180 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 if (!buf1)
9182 goto onError;
9183 buf2 = PyUnicode_DATA(sub_obj);
9184 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009185 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 if (!buf2)
9187 goto onError;
9188 len1 = PyUnicode_GET_LENGTH(str_obj);
9189 len2 = PyUnicode_GET_LENGTH(sub_obj);
9190
9191 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009192 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009194 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9195 result = asciilib_count(
9196 ((Py_UCS1*)buf1) + start, end - start,
9197 buf2, len2, PY_SSIZE_T_MAX
9198 );
9199 else
9200 result = ucs1lib_count(
9201 ((Py_UCS1*)buf1) + start, end - start,
9202 buf2, len2, PY_SSIZE_T_MAX
9203 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 break;
9205 case PyUnicode_2BYTE_KIND:
9206 result = ucs2lib_count(
9207 ((Py_UCS2*)buf1) + start, end - start,
9208 buf2, len2, PY_SSIZE_T_MAX
9209 );
9210 break;
9211 case PyUnicode_4BYTE_KIND:
9212 result = ucs4lib_count(
9213 ((Py_UCS4*)buf1) + start, end - start,
9214 buf2, len2, PY_SSIZE_T_MAX
9215 );
9216 break;
9217 default:
9218 assert(0); result = 0;
9219 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009220
9221 Py_DECREF(sub_obj);
9222 Py_DECREF(str_obj);
9223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 if (kind1 != kind)
9225 PyMem_Free(buf1);
9226 if (kind2 != kind)
9227 PyMem_Free(buf2);
9228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 onError:
9231 Py_DECREF(sub_obj);
9232 Py_DECREF(str_obj);
9233 if (kind1 != kind && buf1)
9234 PyMem_Free(buf1);
9235 if (kind2 != kind && buf2)
9236 PyMem_Free(buf2);
9237 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238}
9239
Alexander Belopolsky40018472011-02-26 01:02:56 +00009240Py_ssize_t
9241PyUnicode_Find(PyObject *str,
9242 PyObject *sub,
9243 Py_ssize_t start,
9244 Py_ssize_t end,
9245 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009247 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009248
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009250 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009251 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009252 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009253 if (!sub) {
9254 Py_DECREF(str);
9255 return -2;
9256 }
9257 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9258 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 Py_DECREF(str);
9260 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 }
Tim Petersced69f82003-09-16 20:30:58 +00009262
Victor Stinner794d5672011-10-10 03:21:36 +02009263 result = any_find_slice(direction,
9264 str, sub, start, end
9265 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009266
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009268 Py_DECREF(sub);
9269
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 return result;
9271}
9272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273Py_ssize_t
9274PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9275 Py_ssize_t start, Py_ssize_t end,
9276 int direction)
9277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009279 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 if (PyUnicode_READY(str) == -1)
9281 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009282 if (start < 0 || end < 0) {
9283 PyErr_SetString(PyExc_IndexError, "string index out of range");
9284 return -2;
9285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 if (end > PyUnicode_GET_LENGTH(str))
9287 end = PyUnicode_GET_LENGTH(str);
9288 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009289 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9290 kind, end-start, ch, direction);
9291 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009293 else
9294 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295}
9296
Alexander Belopolsky40018472011-02-26 01:02:56 +00009297static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009298tailmatch(PyObject *self,
9299 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009300 Py_ssize_t start,
9301 Py_ssize_t end,
9302 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 int kind_self;
9305 int kind_sub;
9306 void *data_self;
9307 void *data_sub;
9308 Py_ssize_t offset;
9309 Py_ssize_t i;
9310 Py_ssize_t end_sub;
9311
9312 if (PyUnicode_READY(self) == -1 ||
9313 PyUnicode_READY(substring) == -1)
9314 return 0;
9315
9316 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317 return 1;
9318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9320 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 kind_self = PyUnicode_KIND(self);
9325 data_self = PyUnicode_DATA(self);
9326 kind_sub = PyUnicode_KIND(substring);
9327 data_sub = PyUnicode_DATA(substring);
9328 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9329
9330 if (direction > 0)
9331 offset = end;
9332 else
9333 offset = start;
9334
9335 if (PyUnicode_READ(kind_self, data_self, offset) ==
9336 PyUnicode_READ(kind_sub, data_sub, 0) &&
9337 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9338 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9339 /* If both are of the same kind, memcmp is sufficient */
9340 if (kind_self == kind_sub) {
9341 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009342 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 data_sub,
9344 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009345 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 }
9347 /* otherwise we have to compare each character by first accesing it */
9348 else {
9349 /* We do not need to compare 0 and len(substring)-1 because
9350 the if statement above ensured already that they are equal
9351 when we end up here. */
9352 // TODO: honor direction and do a forward or backwards search
9353 for (i = 1; i < end_sub; ++i) {
9354 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9355 PyUnicode_READ(kind_sub, data_sub, i))
9356 return 0;
9357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 }
9361
9362 return 0;
9363}
9364
Alexander Belopolsky40018472011-02-26 01:02:56 +00009365Py_ssize_t
9366PyUnicode_Tailmatch(PyObject *str,
9367 PyObject *substr,
9368 Py_ssize_t start,
9369 Py_ssize_t end,
9370 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009372 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 str = PyUnicode_FromObject(str);
9375 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 substr = PyUnicode_FromObject(substr);
9378 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 Py_DECREF(str);
9380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 }
Tim Petersced69f82003-09-16 20:30:58 +00009382
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009383 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 Py_DECREF(str);
9386 Py_DECREF(substr);
9387 return result;
9388}
9389
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390/* Apply fixfct filter to the Unicode object self and return a
9391 reference to the modified object */
9392
Alexander Belopolsky40018472011-02-26 01:02:56 +00009393static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009394fixup(PyObject *self,
9395 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 PyObject *u;
9398 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009399 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009401 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009404 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 /* fix functions return the new maximum character in a string,
9407 if the kind of the resulting unicode object does not change,
9408 everything is fine. Otherwise we need to change the string kind
9409 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009410 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009411
9412 if (maxchar_new == 0) {
9413 /* no changes */;
9414 if (PyUnicode_CheckExact(self)) {
9415 Py_DECREF(u);
9416 Py_INCREF(self);
9417 return self;
9418 }
9419 else
9420 return u;
9421 }
9422
9423 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 maxchar_new = 127;
9425 else if (maxchar_new <= 255)
9426 maxchar_new = 255;
9427 else if (maxchar_new <= 65535)
9428 maxchar_new = 65535;
9429 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009430 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431
Victor Stinnereaab6042011-12-11 22:22:39 +01009432 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009434
9435 /* In case the maximum character changed, we need to
9436 convert the string to the new category. */
9437 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9438 if (v == NULL) {
9439 Py_DECREF(u);
9440 return NULL;
9441 }
9442 if (maxchar_new > maxchar_old) {
9443 /* If the maxchar increased so that the kind changed, not all
9444 characters are representable anymore and we need to fix the
9445 string again. This only happens in very few cases. */
9446 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9447 maxchar_old = fixfct(v);
9448 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 }
9450 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009451 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009453 Py_DECREF(u);
9454 assert(_PyUnicode_CheckConsistency(v, 1));
9455 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456}
9457
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009458static PyObject *
9459ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9462 char *resdata, *data = PyUnicode_DATA(self);
9463 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009464
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465 res = PyUnicode_New(len, 127);
9466 if (res == NULL)
9467 return NULL;
9468 resdata = PyUnicode_DATA(res);
9469 if (lower)
9470 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472 _Py_bytes_upper(resdata, data, len);
9473 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009477handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009479 Py_ssize_t j;
9480 int final_sigma;
9481 Py_UCS4 c;
9482 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009483
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9485
9486 where ! is a negation and \p{xxx} is a character with property xxx.
9487 */
9488 for (j = i - 1; j >= 0; j--) {
9489 c = PyUnicode_READ(kind, data, j);
9490 if (!_PyUnicode_IsCaseIgnorable(c))
9491 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9494 if (final_sigma) {
9495 for (j = i + 1; j < length; j++) {
9496 c = PyUnicode_READ(kind, data, j);
9497 if (!_PyUnicode_IsCaseIgnorable(c))
9498 break;
9499 }
9500 final_sigma = j == length || !_PyUnicode_IsCased(c);
9501 }
9502 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503}
9504
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505static int
9506lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9507 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009509 /* Obscure special case. */
9510 if (c == 0x3A3) {
9511 mapped[0] = handle_capital_sigma(kind, data, length, i);
9512 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009514 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515}
9516
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517static Py_ssize_t
9518do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520 Py_ssize_t i, k = 0;
9521 int n_res, j;
9522 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009523
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524 c = PyUnicode_READ(kind, data, 0);
9525 n_res = _PyUnicode_ToUpperFull(c, mapped);
9526 for (j = 0; j < n_res; j++) {
9527 if (mapped[j] > *maxchar)
9528 *maxchar = mapped[j];
9529 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009531 for (i = 1; i < length; i++) {
9532 c = PyUnicode_READ(kind, data, i);
9533 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9534 for (j = 0; j < n_res; j++) {
9535 if (mapped[j] > *maxchar)
9536 *maxchar = mapped[j];
9537 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009538 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009539 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009540 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541}
9542
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009543static Py_ssize_t
9544do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9545 Py_ssize_t i, k = 0;
9546
9547 for (i = 0; i < length; i++) {
9548 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9549 int n_res, j;
9550 if (Py_UNICODE_ISUPPER(c)) {
9551 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9552 }
9553 else if (Py_UNICODE_ISLOWER(c)) {
9554 n_res = _PyUnicode_ToUpperFull(c, mapped);
9555 }
9556 else {
9557 n_res = 1;
9558 mapped[0] = c;
9559 }
9560 for (j = 0; j < n_res; j++) {
9561 if (mapped[j] > *maxchar)
9562 *maxchar = mapped[j];
9563 res[k++] = mapped[j];
9564 }
9565 }
9566 return k;
9567}
9568
9569static Py_ssize_t
9570do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9571 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009573 Py_ssize_t i, k = 0;
9574
9575 for (i = 0; i < length; i++) {
9576 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9577 int n_res, j;
9578 if (lower)
9579 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9580 else
9581 n_res = _PyUnicode_ToUpperFull(c, mapped);
9582 for (j = 0; j < n_res; j++) {
9583 if (mapped[j] > *maxchar)
9584 *maxchar = mapped[j];
9585 res[k++] = mapped[j];
9586 }
9587 }
9588 return k;
9589}
9590
9591static Py_ssize_t
9592do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9593{
9594 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9595}
9596
9597static Py_ssize_t
9598do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9599{
9600 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9601}
9602
Benjamin Petersone51757f2012-01-12 21:10:29 -05009603static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009604do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9605{
9606 Py_ssize_t i, k = 0;
9607
9608 for (i = 0; i < length; i++) {
9609 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9610 Py_UCS4 mapped[3];
9611 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9612 for (j = 0; j < n_res; j++) {
9613 if (mapped[j] > *maxchar)
9614 *maxchar = mapped[j];
9615 res[k++] = mapped[j];
9616 }
9617 }
9618 return k;
9619}
9620
9621static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009622do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9623{
9624 Py_ssize_t i, k = 0;
9625 int previous_is_cased;
9626
9627 previous_is_cased = 0;
9628 for (i = 0; i < length; i++) {
9629 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9630 Py_UCS4 mapped[3];
9631 int n_res, j;
9632
9633 if (previous_is_cased)
9634 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9635 else
9636 n_res = _PyUnicode_ToTitleFull(c, mapped);
9637
9638 for (j = 0; j < n_res; j++) {
9639 if (mapped[j] > *maxchar)
9640 *maxchar = mapped[j];
9641 res[k++] = mapped[j];
9642 }
9643
9644 previous_is_cased = _PyUnicode_IsCased(c);
9645 }
9646 return k;
9647}
9648
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009649static PyObject *
9650case_operation(PyObject *self,
9651 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9652{
9653 PyObject *res = NULL;
9654 Py_ssize_t length, newlength = 0;
9655 int kind, outkind;
9656 void *data, *outdata;
9657 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9658
Benjamin Petersoneea48462012-01-16 14:28:50 -05009659 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660
9661 kind = PyUnicode_KIND(self);
9662 data = PyUnicode_DATA(self);
9663 length = PyUnicode_GET_LENGTH(self);
9664 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9665 if (tmp == NULL)
9666 return PyErr_NoMemory();
9667 newlength = perform(kind, data, length, tmp, &maxchar);
9668 res = PyUnicode_New(newlength, maxchar);
9669 if (res == NULL)
9670 goto leave;
9671 tmpend = tmp + newlength;
9672 outdata = PyUnicode_DATA(res);
9673 outkind = PyUnicode_KIND(res);
9674 switch (outkind) {
9675 case PyUnicode_1BYTE_KIND:
9676 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9677 break;
9678 case PyUnicode_2BYTE_KIND:
9679 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9680 break;
9681 case PyUnicode_4BYTE_KIND:
9682 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9683 break;
9684 default:
9685 assert(0);
9686 break;
9687 }
9688 leave:
9689 PyMem_FREE(tmp);
9690 return res;
9691}
9692
Tim Peters8ce9f162004-08-27 01:49:32 +00009693PyObject *
9694PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009697 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009699 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009700 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9701 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009702 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009704 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009706 int use_memcpy;
9707 unsigned char *res_data = NULL, *sep_data = NULL;
9708 PyObject *last_obj;
9709 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710
Tim Peters05eba1f2004-08-27 21:32:02 +00009711 fseq = PySequence_Fast(seq, "");
9712 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009713 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009714 }
9715
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716 /* NOTE: the following code can't call back into Python code,
9717 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009718 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719
Tim Peters05eba1f2004-08-27 21:32:02 +00009720 seqlen = PySequence_Fast_GET_SIZE(fseq);
9721 /* If empty sequence, return u"". */
9722 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009723 Py_DECREF(fseq);
9724 Py_INCREF(unicode_empty);
9725 res = unicode_empty;
9726 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009727 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009728
Tim Peters05eba1f2004-08-27 21:32:02 +00009729 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009730 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009731 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009732 if (seqlen == 1) {
9733 if (PyUnicode_CheckExact(items[0])) {
9734 res = items[0];
9735 Py_INCREF(res);
9736 Py_DECREF(fseq);
9737 return res;
9738 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009739 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009740 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009741 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009742 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009743 /* Set up sep and seplen */
9744 if (separator == NULL) {
9745 /* fall back to a blank space separator */
9746 sep = PyUnicode_FromOrdinal(' ');
9747 if (!sep)
9748 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009749 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009750 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009751 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009752 else {
9753 if (!PyUnicode_Check(separator)) {
9754 PyErr_Format(PyExc_TypeError,
9755 "separator: expected str instance,"
9756 " %.80s found",
9757 Py_TYPE(separator)->tp_name);
9758 goto onError;
9759 }
9760 if (PyUnicode_READY(separator))
9761 goto onError;
9762 sep = separator;
9763 seplen = PyUnicode_GET_LENGTH(separator);
9764 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9765 /* inc refcount to keep this code path symmetric with the
9766 above case of a blank separator */
9767 Py_INCREF(sep);
9768 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009769 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009770 }
9771
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009772 /* There are at least two things to join, or else we have a subclass
9773 * of str in the sequence.
9774 * Do a pre-pass to figure out the total amount of space we'll
9775 * need (sz), and see whether all argument are strings.
9776 */
9777 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009778#ifdef Py_DEBUG
9779 use_memcpy = 0;
9780#else
9781 use_memcpy = 1;
9782#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009783 for (i = 0; i < seqlen; i++) {
9784 const Py_ssize_t old_sz = sz;
9785 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 if (!PyUnicode_Check(item)) {
9787 PyErr_Format(PyExc_TypeError,
9788 "sequence item %zd: expected str instance,"
9789 " %.80s found",
9790 i, Py_TYPE(item)->tp_name);
9791 goto onError;
9792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (PyUnicode_READY(item) == -1)
9794 goto onError;
9795 sz += PyUnicode_GET_LENGTH(item);
9796 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009797 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009798 if (i != 0)
9799 sz += seplen;
9800 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9801 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009802 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009803 goto onError;
9804 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009805 if (use_memcpy && last_obj != NULL) {
9806 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9807 use_memcpy = 0;
9808 }
9809 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009810 }
Tim Petersced69f82003-09-16 20:30:58 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009813 if (res == NULL)
9814 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009815
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009816 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009817#ifdef Py_DEBUG
9818 use_memcpy = 0;
9819#else
9820 if (use_memcpy) {
9821 res_data = PyUnicode_1BYTE_DATA(res);
9822 kind = PyUnicode_KIND(res);
9823 if (seplen != 0)
9824 sep_data = PyUnicode_1BYTE_DATA(sep);
9825 }
9826#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009828 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009829 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009830 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009831 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009832 if (use_memcpy) {
9833 Py_MEMCPY(res_data,
9834 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009835 kind * seplen);
9836 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009837 }
9838 else {
9839 copy_characters(res, res_offset, sep, 0, seplen);
9840 res_offset += seplen;
9841 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009842 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009843 itemlen = PyUnicode_GET_LENGTH(item);
9844 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009845 if (use_memcpy) {
9846 Py_MEMCPY(res_data,
9847 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009848 kind * itemlen);
9849 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009850 }
9851 else {
9852 copy_characters(res, res_offset, item, 0, itemlen);
9853 res_offset += itemlen;
9854 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009855 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009856 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009857 if (use_memcpy)
9858 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009859 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 else
9861 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009862
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009865 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009869 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009871 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872 return NULL;
9873}
9874
/* Store value into length consecutive characters of data, beginning at
   character index start.  kind selects the character width and must be
   one of the 1-, 2- or 4-byte kinds.

   The arguments may be evaluated more than once, so they must not have
   side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9898
Victor Stinner3fe55312012-01-04 00:33:50 +01009899Py_ssize_t
9900PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9901 Py_UCS4 fill_char)
9902{
9903 Py_ssize_t maxlen;
9904 enum PyUnicode_Kind kind;
9905 void *data;
9906
9907 if (!PyUnicode_Check(unicode)) {
9908 PyErr_BadInternalCall();
9909 return -1;
9910 }
9911 if (PyUnicode_READY(unicode) == -1)
9912 return -1;
9913 if (unicode_check_modifiable(unicode))
9914 return -1;
9915
9916 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9917 PyErr_SetString(PyExc_ValueError,
9918 "fill character is bigger than "
9919 "the string maximum character");
9920 return -1;
9921 }
9922
9923 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9924 length = Py_MIN(maxlen, length);
9925 if (length <= 0)
9926 return 0;
9927
9928 kind = PyUnicode_KIND(unicode);
9929 data = PyUnicode_DATA(unicode);
9930 FILL(kind, data, fill_char, start, length);
9931 return length;
9932}
9933
Victor Stinner9310abb2011-10-05 00:59:23 +02009934static PyObject *
9935pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009936 Py_ssize_t left,
9937 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 PyObject *u;
9941 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009942 int kind;
9943 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944
9945 if (left < 0)
9946 left = 0;
9947 if (right < 0)
9948 right = 0;
9949
Victor Stinnerc4b49542011-12-11 22:44:26 +01009950 if (left == 0 && right == 0)
9951 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9954 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009955 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9956 return NULL;
9957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9959 if (fill > maxchar)
9960 maxchar = fill;
9961 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009962 if (!u)
9963 return NULL;
9964
9965 kind = PyUnicode_KIND(u);
9966 data = PyUnicode_DATA(u);
9967 if (left)
9968 FILL(kind, data, fill, 0, left);
9969 if (right)
9970 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009971 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009972 assert(_PyUnicode_CheckConsistency(u, 1));
9973 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976
Alexander Belopolsky40018472011-02-26 01:02:56 +00009977PyObject *
9978PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981
9982 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009983 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009984 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009985 if (PyUnicode_READY(string) == -1) {
9986 Py_DECREF(string);
9987 return NULL;
9988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989
Benjamin Petersonead6b532011-12-20 17:23:42 -06009990 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009992 if (PyUnicode_IS_ASCII(string))
9993 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009994 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009995 PyUnicode_GET_LENGTH(string), keepends);
9996 else
9997 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009998 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 break;
10001 case PyUnicode_2BYTE_KIND:
10002 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 PyUnicode_GET_LENGTH(string), keepends);
10005 break;
10006 case PyUnicode_4BYTE_KIND:
10007 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010008 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 PyUnicode_GET_LENGTH(string), keepends);
10010 break;
10011 default:
10012 assert(0);
10013 list = 0;
10014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015 Py_DECREF(string);
10016 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017}
10018
Alexander Belopolsky40018472011-02-26 01:02:56 +000010019static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010020split(PyObject *self,
10021 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010022 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 int kind1, kind2, kind;
10025 void *buf1, *buf2;
10026 Py_ssize_t len1, len2;
10027 PyObject* out;
10028
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010030 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 if (PyUnicode_READY(self) == -1)
10033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010036 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010038 if (PyUnicode_IS_ASCII(self))
10039 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010040 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010041 PyUnicode_GET_LENGTH(self), maxcount
10042 );
10043 else
10044 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010045 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010046 PyUnicode_GET_LENGTH(self), maxcount
10047 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 case PyUnicode_2BYTE_KIND:
10049 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010050 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 PyUnicode_GET_LENGTH(self), maxcount
10052 );
10053 case PyUnicode_4BYTE_KIND:
10054 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010055 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 PyUnicode_GET_LENGTH(self), maxcount
10057 );
10058 default:
10059 assert(0);
10060 return NULL;
10061 }
10062
10063 if (PyUnicode_READY(substring) == -1)
10064 return NULL;
10065
10066 kind1 = PyUnicode_KIND(self);
10067 kind2 = PyUnicode_KIND(substring);
10068 kind = kind1 > kind2 ? kind1 : kind2;
10069 buf1 = PyUnicode_DATA(self);
10070 buf2 = PyUnicode_DATA(substring);
10071 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010072 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (!buf1)
10074 return NULL;
10075 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010076 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (!buf2) {
10078 if (kind1 != kind) PyMem_Free(buf1);
10079 return NULL;
10080 }
10081 len1 = PyUnicode_GET_LENGTH(self);
10082 len2 = PyUnicode_GET_LENGTH(substring);
10083
Benjamin Petersonead6b532011-12-20 17:23:42 -060010084 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10087 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010088 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010089 else
10090 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010091 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 break;
10093 case PyUnicode_2BYTE_KIND:
10094 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010095 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 break;
10097 case PyUnicode_4BYTE_KIND:
10098 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010099 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 break;
10101 default:
10102 out = NULL;
10103 }
10104 if (kind1 != kind)
10105 PyMem_Free(buf1);
10106 if (kind2 != kind)
10107 PyMem_Free(buf2);
10108 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109}
10110
Alexander Belopolsky40018472011-02-26 01:02:56 +000010111static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010112rsplit(PyObject *self,
10113 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010114 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 int kind1, kind2, kind;
10117 void *buf1, *buf2;
10118 Py_ssize_t len1, len2;
10119 PyObject* out;
10120
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010121 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010122 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (PyUnicode_READY(self) == -1)
10125 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010128 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010130 if (PyUnicode_IS_ASCII(self))
10131 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010132 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010133 PyUnicode_GET_LENGTH(self), maxcount
10134 );
10135 else
10136 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010137 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010138 PyUnicode_GET_LENGTH(self), maxcount
10139 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 case PyUnicode_2BYTE_KIND:
10141 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010142 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 PyUnicode_GET_LENGTH(self), maxcount
10144 );
10145 case PyUnicode_4BYTE_KIND:
10146 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010147 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 PyUnicode_GET_LENGTH(self), maxcount
10149 );
10150 default:
10151 assert(0);
10152 return NULL;
10153 }
10154
10155 if (PyUnicode_READY(substring) == -1)
10156 return NULL;
10157
10158 kind1 = PyUnicode_KIND(self);
10159 kind2 = PyUnicode_KIND(substring);
10160 kind = kind1 > kind2 ? kind1 : kind2;
10161 buf1 = PyUnicode_DATA(self);
10162 buf2 = PyUnicode_DATA(substring);
10163 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010164 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 if (!buf1)
10166 return NULL;
10167 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010168 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (!buf2) {
10170 if (kind1 != kind) PyMem_Free(buf1);
10171 return NULL;
10172 }
10173 len1 = PyUnicode_GET_LENGTH(self);
10174 len2 = PyUnicode_GET_LENGTH(substring);
10175
Benjamin Petersonead6b532011-12-20 17:23:42 -060010176 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010178 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10179 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010181 else
10182 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010183 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 break;
10185 case PyUnicode_2BYTE_KIND:
10186 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 break;
10189 case PyUnicode_4BYTE_KIND:
10190 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 break;
10193 default:
10194 out = NULL;
10195 }
10196 if (kind1 != kind)
10197 PyMem_Free(buf1);
10198 if (kind2 != kind)
10199 PyMem_Free(buf2);
10200 return out;
10201}
10202
10203static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10205 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010207 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10210 return asciilib_find(buf1, len1, buf2, len2, offset);
10211 else
10212 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 case PyUnicode_2BYTE_KIND:
10214 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10215 case PyUnicode_4BYTE_KIND:
10216 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10217 }
10218 assert(0);
10219 return -1;
10220}
10221
10222static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10224 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010226 switch (kind) {
10227 case PyUnicode_1BYTE_KIND:
10228 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10229 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10230 else
10231 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10232 case PyUnicode_2BYTE_KIND:
10233 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10234 case PyUnicode_4BYTE_KIND:
10235 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10236 }
10237 assert(0);
10238 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010239}
10240
Alexander Belopolsky40018472011-02-26 01:02:56 +000010241static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242replace(PyObject *self, PyObject *str1,
10243 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 PyObject *u;
10246 char *sbuf = PyUnicode_DATA(self);
10247 char *buf1 = PyUnicode_DATA(str1);
10248 char *buf2 = PyUnicode_DATA(str2);
10249 int srelease = 0, release1 = 0, release2 = 0;
10250 int skind = PyUnicode_KIND(self);
10251 int kind1 = PyUnicode_KIND(str1);
10252 int kind2 = PyUnicode_KIND(str2);
10253 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10254 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10255 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010256 int mayshrink;
10257 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
10259 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010262 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263
Victor Stinner59de0ee2011-10-07 10:01:28 +020010264 if (str1 == str2)
10265 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 if (skind < kind1)
10267 /* substring too wide to be present */
10268 goto nothing;
10269
Victor Stinner49a0a212011-10-12 23:46:10 +020010270 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10271 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10272 /* Replacing str1 with str2 may cause a maxchar reduction in the
10273 result string. */
10274 mayshrink = (maxchar_str2 < maxchar);
10275 maxchar = Py_MAX(maxchar, maxchar_str2);
10276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010278 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010280 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010283 Py_UCS4 u1, u2;
10284 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010285 Py_ssize_t index, pos;
10286 char *src;
10287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010289 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10290 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010296 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010298
10299 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10300 index = 0;
10301 src = sbuf;
10302 while (--maxcount)
10303 {
10304 pos++;
10305 src += pos * PyUnicode_KIND(self);
10306 slen -= pos;
10307 index += pos;
10308 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10309 if (pos < 0)
10310 break;
10311 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10312 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010313 }
10314 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 int rkind = skind;
10316 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010317 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (kind1 < rkind) {
10320 /* widen substring */
10321 buf1 = _PyUnicode_AsKind(str1, rkind);
10322 if (!buf1) goto error;
10323 release1 = 1;
10324 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010326 if (i < 0)
10327 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (rkind > kind2) {
10329 /* widen replacement */
10330 buf2 = _PyUnicode_AsKind(str2, rkind);
10331 if (!buf2) goto error;
10332 release2 = 1;
10333 }
10334 else if (rkind < kind2) {
10335 /* widen self and buf1 */
10336 rkind = kind2;
10337 if (release1) PyMem_Free(buf1);
10338 sbuf = _PyUnicode_AsKind(self, rkind);
10339 if (!sbuf) goto error;
10340 srelease = 1;
10341 buf1 = _PyUnicode_AsKind(str1, rkind);
10342 if (!buf1) goto error;
10343 release1 = 1;
10344 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010345 u = PyUnicode_New(slen, maxchar);
10346 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010348 assert(PyUnicode_KIND(u) == rkind);
10349 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010350
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010352 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010355 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010357
10358 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010359 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010360 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010362 if (i == -1)
10363 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010364 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010370 }
10371 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 Py_ssize_t n, i, j, ires;
10373 Py_ssize_t product, new_size;
10374 int rkind = skind;
10375 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010378 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 buf1 = _PyUnicode_AsKind(str1, rkind);
10380 if (!buf1) goto error;
10381 release1 = 1;
10382 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010383 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010384 if (n == 0)
10385 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010387 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 buf2 = _PyUnicode_AsKind(str2, rkind);
10389 if (!buf2) goto error;
10390 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010393 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 rkind = kind2;
10395 sbuf = _PyUnicode_AsKind(self, rkind);
10396 if (!sbuf) goto error;
10397 srelease = 1;
10398 if (release1) PyMem_Free(buf1);
10399 buf1 = _PyUnicode_AsKind(str1, rkind);
10400 if (!buf1) goto error;
10401 release1 = 1;
10402 }
10403 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10404 PyUnicode_GET_LENGTH(str1))); */
10405 product = n * (len2-len1);
10406 if ((product / (len2-len1)) != n) {
10407 PyErr_SetString(PyExc_OverflowError,
10408 "replace string is too long");
10409 goto error;
10410 }
10411 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010412 if (new_size == 0) {
10413 Py_INCREF(unicode_empty);
10414 u = unicode_empty;
10415 goto done;
10416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10418 PyErr_SetString(PyExc_OverflowError,
10419 "replace string is too long");
10420 goto error;
10421 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 u = PyUnicode_New(new_size, maxchar);
10423 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010425 assert(PyUnicode_KIND(u) == rkind);
10426 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 ires = i = 0;
10428 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010429 while (n-- > 0) {
10430 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010432 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010434 if (j == -1)
10435 break;
10436 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010438 memcpy(res + rkind * ires,
10439 sbuf + rkind * i,
10440 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010442 }
10443 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010445 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010447 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010454 memcpy(res + rkind * ires,
10455 sbuf + rkind * i,
10456 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010457 }
10458 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010459 /* interleave */
10460 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010461 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 if (--n <= 0)
10466 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010467 memcpy(res + rkind * ires,
10468 sbuf + rkind * i,
10469 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 ires++;
10471 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010473 memcpy(res + rkind * ires,
10474 sbuf + rkind * i,
10475 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010477 }
10478
10479 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010480 unicode_adjust_maxchar(&u);
10481 if (u == NULL)
10482 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010484
10485 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (srelease)
10487 PyMem_FREE(sbuf);
10488 if (release1)
10489 PyMem_FREE(buf1);
10490 if (release2)
10491 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010492 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010496 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (srelease)
10498 PyMem_FREE(sbuf);
10499 if (release1)
10500 PyMem_FREE(buf1);
10501 if (release2)
10502 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010503 return unicode_result_unchanged(self);
10504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 error:
10506 if (srelease && sbuf)
10507 PyMem_FREE(sbuf);
10508 if (release1 && buf1)
10509 PyMem_FREE(buf1);
10510 if (release2 && buf2)
10511 PyMem_FREE(buf2);
10512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513}
10514
10515/* --- Unicode Object Methods --------------------------------------------- */
10516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010517PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010518 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519\n\
10520Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010521characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
10523static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010524unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010526 if (PyUnicode_READY(self) == -1)
10527 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010528 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529}
10530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010531PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533\n\
10534Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010535have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
10537static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010538unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010540 if (PyUnicode_READY(self) == -1)
10541 return NULL;
10542 if (PyUnicode_GET_LENGTH(self) == 0)
10543 return unicode_result_unchanged(self);
10544 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545}
10546
Benjamin Petersond5890c82012-01-14 13:23:30 -050010547PyDoc_STRVAR(casefold__doc__,
10548 "S.casefold() -> str\n\
10549\n\
10550Return a version of S suitable for caseless comparisons.");
10551
10552static PyObject *
10553unicode_casefold(PyObject *self)
10554{
10555 if (PyUnicode_READY(self) == -1)
10556 return NULL;
10557 if (PyUnicode_IS_ASCII(self))
10558 return ascii_upper_or_lower(self, 1);
10559 return case_operation(self, do_casefold);
10560}
10561
10562
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010563/* Argument converter. Coerces to a single unicode character */
10564
10565static int
10566convert_uc(PyObject *obj, void *addr)
10567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010570
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 uniobj = PyUnicode_FromObject(obj);
10572 if (uniobj == NULL) {
10573 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010575 return 0;
10576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 Py_DECREF(uniobj);
10581 return 0;
10582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010584 Py_DECREF(uniobj);
10585 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010586}
10587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010588PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010591Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010592done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
10594static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010595unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010597 Py_ssize_t marg, left;
10598 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 Py_UCS4 fillchar = ' ';
10600
Victor Stinnere9a29352011-10-01 02:14:59 +020010601 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Benjamin Petersonbac79492012-01-14 13:34:47 -050010604 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605 return NULL;
10606
Victor Stinnerc4b49542011-12-11 22:44:26 +010010607 if (PyUnicode_GET_LENGTH(self) >= width)
10608 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
Victor Stinnerc4b49542011-12-11 22:44:26 +010010610 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 left = marg / 2 + (marg & width & 1);
10612
Victor Stinner9310abb2011-10-05 00:59:23 +020010613 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614}
10615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616/* This function assumes that str1 and str2 are readied by the caller. */
10617
Marc-André Lemburge5034372000-08-08 08:04:29 +000010618static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010619unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 int kind1, kind2;
10622 void *data1, *data2;
10623 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 kind1 = PyUnicode_KIND(str1);
10626 kind2 = PyUnicode_KIND(str2);
10627 data1 = PyUnicode_DATA(str1);
10628 data2 = PyUnicode_DATA(str2);
10629 len1 = PyUnicode_GET_LENGTH(str1);
10630 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 for (i = 0; i < len1 && i < len2; ++i) {
10633 Py_UCS4 c1, c2;
10634 c1 = PyUnicode_READ(kind1, data1, i);
10635 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010636
10637 if (c1 != c2)
10638 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010639 }
10640
10641 return (len1 < len2) ? -1 : (len1 != len2);
10642}
10643
Alexander Belopolsky40018472011-02-26 01:02:56 +000010644int
10645PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10648 if (PyUnicode_READY(left) == -1 ||
10649 PyUnicode_READY(right) == -1)
10650 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010651 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010653 PyErr_Format(PyExc_TypeError,
10654 "Can't compare %.100s and %.100s",
10655 left->ob_type->tp_name,
10656 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 return -1;
10658}
10659
Martin v. Löwis5b222132007-06-10 09:51:05 +000010660int
10661PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 Py_ssize_t i;
10664 int kind;
10665 void *data;
10666 Py_UCS4 chr;
10667
Victor Stinner910337b2011-10-03 03:20:16 +020010668 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 if (PyUnicode_READY(uni) == -1)
10670 return -1;
10671 kind = PyUnicode_KIND(uni);
10672 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010673 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10675 if (chr != str[i])
10676 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010677 /* This check keeps Python strings that end in '\0' from comparing equal
10678 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010681 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010683 return 0;
10684}
10685
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010686
/* Select the Python bool singleton matching a C truth value.  The macro
   only picks Py_True or Py_False; it does not touch reference counts. */
#define TEST_COND(cond)                         \
    ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010689
Alexander Belopolsky40018472011-02-26 01:02:56 +000010690PyObject *
10691PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010692{
10693 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010694
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010695 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10696 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (PyUnicode_READY(left) == -1 ||
10698 PyUnicode_READY(right) == -1)
10699 return NULL;
10700 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10701 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010702 if (op == Py_EQ) {
10703 Py_INCREF(Py_False);
10704 return Py_False;
10705 }
10706 if (op == Py_NE) {
10707 Py_INCREF(Py_True);
10708 return Py_True;
10709 }
10710 }
10711 if (left == right)
10712 result = 0;
10713 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010714 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010715
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010716 /* Convert the return value to a Boolean */
10717 switch (op) {
10718 case Py_EQ:
10719 v = TEST_COND(result == 0);
10720 break;
10721 case Py_NE:
10722 v = TEST_COND(result != 0);
10723 break;
10724 case Py_LE:
10725 v = TEST_COND(result <= 0);
10726 break;
10727 case Py_GE:
10728 v = TEST_COND(result >= 0);
10729 break;
10730 case Py_LT:
10731 v = TEST_COND(result == -1);
10732 break;
10733 case Py_GT:
10734 v = TEST_COND(result == 1);
10735 break;
10736 default:
10737 PyErr_BadArgument();
10738 return NULL;
10739 }
10740 Py_INCREF(v);
10741 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743
Brian Curtindfc80e32011-08-10 20:28:54 -050010744 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010745}
10746
Alexander Belopolsky40018472011-02-26 01:02:56 +000010747int
10748PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010749{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 int kind1, kind2, kind;
10752 void *buf1, *buf2;
10753 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010754 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010755
10756 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010757 sub = PyUnicode_FromObject(element);
10758 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 PyErr_Format(PyExc_TypeError,
10760 "'in <string>' requires string as left operand, not %s",
10761 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010763 }
10764
Thomas Wouters477c8d52006-05-27 19:21:47 +000010765 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010766 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010767 Py_DECREF(sub);
10768 return -1;
10769 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010770 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10771 Py_DECREF(sub);
10772 Py_DECREF(str);
10773 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 kind1 = PyUnicode_KIND(str);
10776 kind2 = PyUnicode_KIND(sub);
10777 kind = kind1 > kind2 ? kind1 : kind2;
10778 buf1 = PyUnicode_DATA(str);
10779 buf2 = PyUnicode_DATA(sub);
10780 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010781 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (!buf1) {
10783 Py_DECREF(sub);
10784 return -1;
10785 }
10786 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010787 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (!buf2) {
10789 Py_DECREF(sub);
10790 if (kind1 != kind) PyMem_Free(buf1);
10791 return -1;
10792 }
10793 len1 = PyUnicode_GET_LENGTH(str);
10794 len2 = PyUnicode_GET_LENGTH(sub);
10795
Benjamin Petersonead6b532011-12-20 17:23:42 -060010796 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 case PyUnicode_1BYTE_KIND:
10798 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10799 break;
10800 case PyUnicode_2BYTE_KIND:
10801 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10802 break;
10803 case PyUnicode_4BYTE_KIND:
10804 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10805 break;
10806 default:
10807 result = -1;
10808 assert(0);
10809 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010810
10811 Py_DECREF(str);
10812 Py_DECREF(sub);
10813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 if (kind1 != kind)
10815 PyMem_Free(buf1);
10816 if (kind2 != kind)
10817 PyMem_Free(buf2);
10818
Guido van Rossum403d68b2000-03-13 15:55:09 +000010819 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010820}
10821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822/* Concat to string or Unicode object giving a new Unicode object. */
10823
Alexander Belopolsky40018472011-02-26 01:02:56 +000010824PyObject *
10825PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010828 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010829 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
10831 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010834 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
10839 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010840 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010844 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847 }
10848
Victor Stinner488fa492011-12-12 00:01:39 +010010849 u_len = PyUnicode_GET_LENGTH(u);
10850 v_len = PyUnicode_GET_LENGTH(v);
10851 if (u_len > PY_SSIZE_T_MAX - v_len) {
10852 PyErr_SetString(PyExc_OverflowError,
10853 "strings are too large to concat");
10854 goto onError;
10855 }
10856 new_len = u_len + v_len;
10857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010859 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10860 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010863 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010866 copy_characters(w, 0, u, 0, u_len);
10867 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868 Py_DECREF(u);
10869 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010870 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874 Py_XDECREF(u);
10875 Py_XDECREF(v);
10876 return NULL;
10877}
10878
Walter Dörwald1ab83302007-05-18 17:15:44 +000010879void
Victor Stinner23e56682011-10-03 03:54:37 +020010880PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010881{
Victor Stinner23e56682011-10-03 03:54:37 +020010882 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010883 Py_UCS4 maxchar, maxchar2;
10884 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010885
10886 if (p_left == NULL) {
10887 if (!PyErr_Occurred())
10888 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889 return;
10890 }
Victor Stinner23e56682011-10-03 03:54:37 +020010891 left = *p_left;
10892 if (right == NULL || !PyUnicode_Check(left)) {
10893 if (!PyErr_Occurred())
10894 PyErr_BadInternalCall();
10895 goto error;
10896 }
10897
Benjamin Petersonbac79492012-01-14 13:34:47 -050010898 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010899 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010900 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010901 goto error;
10902
Victor Stinner488fa492011-12-12 00:01:39 +010010903 /* Shortcuts */
10904 if (left == unicode_empty) {
10905 Py_DECREF(left);
10906 Py_INCREF(right);
10907 *p_left = right;
10908 return;
10909 }
10910 if (right == unicode_empty)
10911 return;
10912
10913 left_len = PyUnicode_GET_LENGTH(left);
10914 right_len = PyUnicode_GET_LENGTH(right);
10915 if (left_len > PY_SSIZE_T_MAX - right_len) {
10916 PyErr_SetString(PyExc_OverflowError,
10917 "strings are too large to concat");
10918 goto error;
10919 }
10920 new_len = left_len + right_len;
10921
10922 if (unicode_modifiable(left)
10923 && PyUnicode_CheckExact(right)
10924 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010925 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10926 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010927 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010928 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010929 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10930 {
10931 /* append inplace */
10932 if (unicode_resize(p_left, new_len) != 0) {
10933 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10934 * deallocated so it cannot be put back into
10935 * 'variable'. The MemoryError is raised when there
10936 * is no value in 'variable', which might (very
10937 * remotely) be a cause of incompatibilities.
10938 */
10939 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010940 }
Victor Stinner488fa492011-12-12 00:01:39 +010010941 /* copy 'right' into the newly allocated area of 'left' */
10942 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010943 }
Victor Stinner488fa492011-12-12 00:01:39 +010010944 else {
10945 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10946 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10947 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010948
Victor Stinner488fa492011-12-12 00:01:39 +010010949 /* Concat the two Unicode strings */
10950 res = PyUnicode_New(new_len, maxchar);
10951 if (res == NULL)
10952 goto error;
10953 copy_characters(res, 0, left, 0, left_len);
10954 copy_characters(res, left_len, right, 0, right_len);
10955 Py_DECREF(left);
10956 *p_left = res;
10957 }
10958 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010959 return;
10960
10961error:
Victor Stinner488fa492011-12-12 00:01:39 +010010962 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010963}
10964
10965void
10966PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010968 PyUnicode_Append(pleft, right);
10969 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010970}
10971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010972PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010975Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010976string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010977interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
10979static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010980unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010982 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010983 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010984 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 int kind1, kind2, kind;
10987 void *buf1, *buf2;
10988 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
Jesus Ceaac451502011-04-20 17:09:23 +020010990 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10991 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 kind1 = PyUnicode_KIND(self);
10995 kind2 = PyUnicode_KIND(substring);
10996 kind = kind1 > kind2 ? kind1 : kind2;
10997 buf1 = PyUnicode_DATA(self);
10998 buf2 = PyUnicode_DATA(substring);
10999 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011000 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 if (!buf1) {
11002 Py_DECREF(substring);
11003 return NULL;
11004 }
11005 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011006 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (!buf2) {
11008 Py_DECREF(substring);
11009 if (kind1 != kind) PyMem_Free(buf1);
11010 return NULL;
11011 }
11012 len1 = PyUnicode_GET_LENGTH(self);
11013 len2 = PyUnicode_GET_LENGTH(substring);
11014
11015 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011016 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 case PyUnicode_1BYTE_KIND:
11018 iresult = ucs1lib_count(
11019 ((Py_UCS1*)buf1) + start, end - start,
11020 buf2, len2, PY_SSIZE_T_MAX
11021 );
11022 break;
11023 case PyUnicode_2BYTE_KIND:
11024 iresult = ucs2lib_count(
11025 ((Py_UCS2*)buf1) + start, end - start,
11026 buf2, len2, PY_SSIZE_T_MAX
11027 );
11028 break;
11029 case PyUnicode_4BYTE_KIND:
11030 iresult = ucs4lib_count(
11031 ((Py_UCS4*)buf1) + start, end - start,
11032 buf2, len2, PY_SSIZE_T_MAX
11033 );
11034 break;
11035 default:
11036 assert(0); iresult = 0;
11037 }
11038
11039 result = PyLong_FromSsize_t(iresult);
11040
11041 if (kind1 != kind)
11042 PyMem_Free(buf1);
11043 if (kind2 != kind)
11044 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
11046 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011047
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 return result;
11049}
11050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011051PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011052 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011054Encode S using the codec registered for encoding. Default encoding\n\
11055is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011056handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011057a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11058'xmlcharrefreplace' as well as any other name registered with\n\
11059codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
11061static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011062unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011064 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 char *encoding = NULL;
11066 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011067
Benjamin Peterson308d6372009-09-18 21:42:35 +000011068 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11069 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011071 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011072}
11073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011074PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011075 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076\n\
11077Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011078If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079
11080static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011081unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011083 Py_ssize_t i, j, line_pos, src_len, incr;
11084 Py_UCS4 ch;
11085 PyObject *u;
11086 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011088 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011089 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
11091 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093
Antoine Pitrou22425222011-10-04 19:10:51 +020011094 if (PyUnicode_READY(self) == -1)
11095 return NULL;
11096
Thomas Wouters7e474022000-07-16 12:04:32 +000011097 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011098 src_len = PyUnicode_GET_LENGTH(self);
11099 i = j = line_pos = 0;
11100 kind = PyUnicode_KIND(self);
11101 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011102 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011103 for (; i < src_len; i++) {
11104 ch = PyUnicode_READ(kind, src_data, i);
11105 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011106 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011107 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011108 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011110 goto overflow;
11111 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011112 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011113 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011117 goto overflow;
11118 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011120 if (ch == '\n' || ch == '\r')
11121 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011123 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011124 if (!found)
11125 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011126
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011128 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 if (!u)
11130 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011131 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
Antoine Pitroue71d5742011-10-04 15:55:09 +020011133 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Antoine Pitroue71d5742011-10-04 15:55:09 +020011135 for (; i < src_len; i++) {
11136 ch = PyUnicode_READ(kind, src_data, i);
11137 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011139 incr = tabsize - (line_pos % tabsize);
11140 line_pos += incr;
11141 while (incr--) {
11142 PyUnicode_WRITE(kind, dest_data, j, ' ');
11143 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011144 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011146 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011148 line_pos++;
11149 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011150 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011151 if (ch == '\n' || ch == '\r')
11152 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011154 }
11155 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011156 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011157
Antoine Pitroue71d5742011-10-04 15:55:09 +020011158 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011159 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161}
11162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011163PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165\n\
11166Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011167such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168arguments start and end are interpreted as in slice notation.\n\
11169\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011175 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011176 Py_ssize_t start;
11177 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179
Jesus Ceaac451502011-04-20 17:09:23 +020011180 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11181 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (PyUnicode_READY(self) == -1)
11185 return NULL;
11186 if (PyUnicode_READY(substring) == -1)
11187 return NULL;
11188
Victor Stinner7931d9a2011-11-04 00:22:48 +010011189 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190
11191 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 if (result == -2)
11194 return NULL;
11195
Christian Heimes217cfd12007-12-02 14:31:20 +000011196 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197}
11198
11199static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011200unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011202 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11203 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206}
11207
Guido van Rossumc2504932007-09-18 19:42:40 +000011208/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011209 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011210static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011211unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212{
Guido van Rossumc2504932007-09-18 19:42:40 +000011213 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011214 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (_PyUnicode_HASH(self) != -1)
11217 return _PyUnicode_HASH(self);
11218 if (PyUnicode_READY(self) == -1)
11219 return -1;
11220 len = PyUnicode_GET_LENGTH(self);
11221
11222 /* The hash function as a macro, gets expanded three times below. */
11223#define HASH(P) \
11224 x = (Py_uhash_t)*P << 7; \
11225 while (--len >= 0) \
Gregory P. Smithf5b62a92012-01-14 15:45:13 -080011226 x = (_PyHASH_MULTIPLIER*x) ^ (Py_uhash_t)*P++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227
11228 switch (PyUnicode_KIND(self)) {
11229 case PyUnicode_1BYTE_KIND: {
11230 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11231 HASH(c);
11232 break;
11233 }
11234 case PyUnicode_2BYTE_KIND: {
11235 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11236 HASH(s);
11237 break;
11238 }
11239 default: {
11240 Py_UCS4 *l;
11241 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11242 "Impossible switch case in unicode_hash");
11243 l = PyUnicode_4BYTE_DATA(self);
11244 HASH(l);
11245 break;
11246 }
11247 }
11248 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11249
Guido van Rossumc2504932007-09-18 19:42:40 +000011250 if (x == -1)
11251 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011253 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
11262static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011265 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011266 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011267 Py_ssize_t start;
11268 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
Jesus Ceaac451502011-04-20 17:09:23 +020011270 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11271 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 if (PyUnicode_READY(self) == -1)
11275 return NULL;
11276 if (PyUnicode_READY(substring) == -1)
11277 return NULL;
11278
Victor Stinner7931d9a2011-11-04 00:22:48 +010011279 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
11281 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (result == -2)
11284 return NULL;
11285
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 if (result < 0) {
11287 PyErr_SetString(PyExc_ValueError, "substring not found");
11288 return NULL;
11289 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011290
Christian Heimes217cfd12007-12-02 14:31:20 +000011291 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292}
11293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011294PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011297Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
11300static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011301unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 Py_ssize_t i, length;
11304 int kind;
11305 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 int cased;
11307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 if (PyUnicode_READY(self) == -1)
11309 return NULL;
11310 length = PyUnicode_GET_LENGTH(self);
11311 kind = PyUnicode_KIND(self);
11312 data = PyUnicode_DATA(self);
11313
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 if (length == 1)
11316 return PyBool_FromLong(
11317 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011319 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011322
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 for (i = 0; i < length; i++) {
11325 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011326
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11328 return PyBool_FromLong(0);
11329 else if (!cased && Py_UNICODE_ISLOWER(ch))
11330 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011332 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333}
11334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011335PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011338Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
11341static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011342unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 Py_ssize_t i, length;
11345 int kind;
11346 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347 int cased;
11348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (PyUnicode_READY(self) == -1)
11350 return NULL;
11351 length = PyUnicode_GET_LENGTH(self);
11352 kind = PyUnicode_KIND(self);
11353 data = PyUnicode_DATA(self);
11354
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (length == 1)
11357 return PyBool_FromLong(
11358 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011360 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011363
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 for (i = 0; i < length; i++) {
11366 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011367
Benjamin Peterson29060642009-01-31 22:14:21 +000011368 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11369 return PyBool_FromLong(0);
11370 else if (!cased && Py_UNICODE_ISUPPER(ch))
11371 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011373 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374}
11375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011376PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011379Return True if S is a titlecased string and there is at least one\n\
11380character in S, i.e. upper- and titlecase characters may only\n\
11381follow uncased characters and lowercase characters only cased ones.\n\
11382Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383
11384static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011385unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 Py_ssize_t i, length;
11388 int kind;
11389 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390 int cased, previous_is_cased;
11391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (PyUnicode_READY(self) == -1)
11393 return NULL;
11394 length = PyUnicode_GET_LENGTH(self);
11395 kind = PyUnicode_KIND(self);
11396 data = PyUnicode_DATA(self);
11397
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 if (length == 1) {
11400 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11401 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11402 (Py_UNICODE_ISUPPER(ch) != 0));
11403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011405 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011408
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 cased = 0;
11410 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 for (i = 0; i < length; i++) {
11412 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011413
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11415 if (previous_is_cased)
11416 return PyBool_FromLong(0);
11417 previous_is_cased = 1;
11418 cased = 1;
11419 }
11420 else if (Py_UNICODE_ISLOWER(ch)) {
11421 if (!previous_is_cased)
11422 return PyBool_FromLong(0);
11423 previous_is_cased = 1;
11424 cased = 1;
11425 }
11426 else
11427 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011429 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430}
11431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011432PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011435Return True if all characters in S are whitespace\n\
11436and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
11438static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011439unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 Py_ssize_t i, length;
11442 int kind;
11443 void *data;
11444
11445 if (PyUnicode_READY(self) == -1)
11446 return NULL;
11447 length = PyUnicode_GET_LENGTH(self);
11448 kind = PyUnicode_KIND(self);
11449 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (length == 1)
11453 return PyBool_FromLong(
11454 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011456 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 for (i = 0; i < length; i++) {
11461 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011462 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011465 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466}
11467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011468PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011470\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011471Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011472and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011473
11474static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011475unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 Py_ssize_t i, length;
11478 int kind;
11479 void *data;
11480
11481 if (PyUnicode_READY(self) == -1)
11482 return NULL;
11483 length = PyUnicode_GET_LENGTH(self);
11484 kind = PyUnicode_KIND(self);
11485 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011486
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (length == 1)
11489 return PyBool_FromLong(
11490 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011491
11492 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 for (i = 0; i < length; i++) {
11497 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011499 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011500 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011501}
11502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011505\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011506Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011507and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011508
11509static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011510unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 int kind;
11513 void *data;
11514 Py_ssize_t len, i;
11515
11516 if (PyUnicode_READY(self) == -1)
11517 return NULL;
11518
11519 kind = PyUnicode_KIND(self);
11520 data = PyUnicode_DATA(self);
11521 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011522
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011523 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (len == 1) {
11525 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11526 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11527 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011528
11529 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 for (i = 0; i < len; i++) {
11534 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011535 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011537 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011538 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011539}
11540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011541PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011544Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011545False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
11547static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011548unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 Py_ssize_t i, length;
11551 int kind;
11552 void *data;
11553
11554 if (PyUnicode_READY(self) == -1)
11555 return NULL;
11556 length = PyUnicode_GET_LENGTH(self);
11557 kind = PyUnicode_KIND(self);
11558 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 if (length == 1)
11562 return PyBool_FromLong(
11563 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011565 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 for (i = 0; i < length; i++) {
11570 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011573 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011579Return True if all characters in S are digits\n\
11580and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
11582static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011583unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 Py_ssize_t i, length;
11586 int kind;
11587 void *data;
11588
11589 if (PyUnicode_READY(self) == -1)
11590 return NULL;
11591 length = PyUnicode_GET_LENGTH(self);
11592 kind = PyUnicode_KIND(self);
11593 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 if (length == 1) {
11597 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11598 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011601 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 for (i = 0; i < length; i++) {
11606 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011609 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610}
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011615Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
11618static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 Py_ssize_t i, length;
11622 int kind;
11623 void *data;
11624
11625 if (PyUnicode_READY(self) == -1)
11626 return NULL;
11627 length = PyUnicode_GET_LENGTH(self);
11628 kind = PyUnicode_KIND(self);
11629 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 1)
11633 return PyBool_FromLong(
11634 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011636 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 for (i = 0; i < length; i++) {
11641 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011644 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645}
11646
Martin v. Löwis47383402007-08-15 07:32:56 +000011647int
11648PyUnicode_IsIdentifier(PyObject *self)
11649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 int kind;
11651 void *data;
11652 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011653 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (PyUnicode_READY(self) == -1) {
11656 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 }
11659
11660 /* Special case for empty strings */
11661 if (PyUnicode_GET_LENGTH(self) == 0)
11662 return 0;
11663 kind = PyUnicode_KIND(self);
11664 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011665
11666 /* PEP 3131 says that the first character must be in
11667 XID_Start and subsequent characters in XID_Continue,
11668 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011669 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011670 letters, digits, underscore). However, given the current
11671 definition of XID_Start and XID_Continue, it is sufficient
11672 to check just for these, except that _ must be allowed
11673 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011675 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011676 return 0;
11677
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011678 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011681 return 1;
11682}
11683
11684PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011686\n\
11687Return True if S is a valid identifier according\n\
11688to the language definition.");
11689
11690static PyObject*
11691unicode_isidentifier(PyObject *self)
11692{
11693 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11694}
11695
Georg Brandl559e5d72008-06-11 18:37:52 +000011696PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011698\n\
11699Return True if all characters in S are considered\n\
11700printable in repr() or S is empty, False otherwise.");
11701
11702static PyObject*
11703unicode_isprintable(PyObject *self)
11704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 Py_ssize_t i, length;
11706 int kind;
11707 void *data;
11708
11709 if (PyUnicode_READY(self) == -1)
11710 return NULL;
11711 length = PyUnicode_GET_LENGTH(self);
11712 kind = PyUnicode_KIND(self);
11713 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011714
11715 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (length == 1)
11717 return PyBool_FromLong(
11718 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 for (i = 0; i < length; i++) {
11721 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011722 Py_RETURN_FALSE;
11723 }
11724 }
11725 Py_RETURN_TRUE;
11726}
11727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011728PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011729 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730\n\
11731Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011732iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
11734static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011735unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011737 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738}
11739
Martin v. Löwis18e16552006-02-15 17:27:45 +000011740static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011741unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 if (PyUnicode_READY(self) == -1)
11744 return -1;
11745 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746}
11747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011748PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011751Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011752done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
11754static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011755unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011757 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 Py_UCS4 fillchar = ' ';
11759
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011760 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 return NULL;
11762
Benjamin Petersonbac79492012-01-14 13:34:47 -050011763 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Victor Stinnerc4b49542011-12-11 22:44:26 +010011766 if (PyUnicode_GET_LENGTH(self) >= width)
11767 return unicode_result_unchanged(self);
11768
11769 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770}
11771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011775Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
11777static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011778unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011780 if (PyUnicode_READY(self) == -1)
11781 return NULL;
11782 if (PyUnicode_IS_ASCII(self))
11783 return ascii_upper_or_lower(self, 1);
11784 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785}
11786
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787#define LEFTSTRIP 0
11788#define RIGHTSTRIP 1
11789#define BOTHSTRIP 2
11790
11791/* Arrays indexed by above */
11792static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
11793
11794#define STRIPNAME(i) (stripformat[i]+3)
11795
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011796/* externally visible for str.strip(unicode) */
11797PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011798_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 void *data;
11801 int kind;
11802 Py_ssize_t i, j, len;
11803 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11806 return NULL;
11807
11808 kind = PyUnicode_KIND(self);
11809 data = PyUnicode_DATA(self);
11810 len = PyUnicode_GET_LENGTH(self);
11811 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11812 PyUnicode_DATA(sepobj),
11813 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011814
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 i = 0;
11816 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 while (i < len &&
11818 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 i++;
11820 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011821 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011822
Benjamin Peterson14339b62009-01-31 16:36:08 +000011823 j = len;
11824 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 do {
11826 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 } while (j >= i &&
11828 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011830 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831
Victor Stinner7931d9a2011-11-04 00:22:48 +010011832 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833}
11834
11835PyObject*
11836PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11837{
11838 unsigned char *data;
11839 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011840 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841
Victor Stinnerde636f32011-10-01 03:55:54 +020011842 if (PyUnicode_READY(self) == -1)
11843 return NULL;
11844
11845 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11846
Victor Stinner12bab6d2011-10-01 01:53:49 +020011847 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011848 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849
Victor Stinner12bab6d2011-10-01 01:53:49 +020011850 length = end - start;
11851 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011852 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853
Victor Stinnerde636f32011-10-01 03:55:54 +020011854 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011855 PyErr_SetString(PyExc_IndexError, "string index out of range");
11856 return NULL;
11857 }
11858
Victor Stinnerb9275c12011-10-05 14:01:42 +020011859 if (PyUnicode_IS_ASCII(self)) {
11860 kind = PyUnicode_KIND(self);
11861 data = PyUnicode_1BYTE_DATA(self);
11862 return unicode_fromascii(data + start, length);
11863 }
11864 else {
11865 kind = PyUnicode_KIND(self);
11866 data = PyUnicode_1BYTE_DATA(self);
11867 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011868 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011869 length);
11870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872
11873static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011874do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 int kind;
11877 void *data;
11878 Py_ssize_t len, i, j;
11879
11880 if (PyUnicode_READY(self) == -1)
11881 return NULL;
11882
11883 kind = PyUnicode_KIND(self);
11884 data = PyUnicode_DATA(self);
11885 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011886
Benjamin Peterson14339b62009-01-31 16:36:08 +000011887 i = 0;
11888 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011890 i++;
11891 }
11892 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011893
Benjamin Peterson14339b62009-01-31 16:36:08 +000011894 j = len;
11895 if (striptype != LEFTSTRIP) {
11896 do {
11897 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011899 j++;
11900 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011901
Victor Stinner7931d9a2011-11-04 00:22:48 +010011902 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011905
11906static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011907do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011908{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011909 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011910
Benjamin Peterson14339b62009-01-31 16:36:08 +000011911 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11912 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011913
Benjamin Peterson14339b62009-01-31 16:36:08 +000011914 if (sep != NULL && sep != Py_None) {
11915 if (PyUnicode_Check(sep))
11916 return _PyUnicode_XStrip(self, striptype, sep);
11917 else {
11918 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 "%s arg must be None or str",
11920 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011921 return NULL;
11922 }
11923 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011924
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011926}
11927
11928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011929PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011931\n\
11932Return a copy of the string S with leading and trailing\n\
11933whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011934If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011935
11936static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011937unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011938{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011939 if (PyTuple_GET_SIZE(args) == 0)
11940 return do_strip(self, BOTHSTRIP); /* Common case */
11941 else
11942 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011943}
11944
11945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011946PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011948\n\
11949Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011950If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011951
11952static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011953unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011954{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011955 if (PyTuple_GET_SIZE(args) == 0)
11956 return do_strip(self, LEFTSTRIP); /* Common case */
11957 else
11958 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011959}
11960
11961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011962PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011964\n\
11965Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011966If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967
11968static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011969unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011970{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011971 if (PyTuple_GET_SIZE(args) == 0)
11972 return do_strip(self, RIGHTSTRIP); /* Common case */
11973 else
11974 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011975}
11976
11977
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011979unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011981 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
Georg Brandl222de0f2009-04-12 12:01:50 +000011984 if (len < 1) {
11985 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011986 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
Victor Stinnerc4b49542011-12-11 22:44:26 +010011989 /* no repeat, return original string */
11990 if (len == 1)
11991 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011992
Benjamin Petersonbac79492012-01-14 13:34:47 -050011993 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 return NULL;
11995
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011996 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011997 PyErr_SetString(PyExc_OverflowError,
11998 "repeated string is too long");
11999 return NULL;
12000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012002
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012003 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004 if (!u)
12005 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012006 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (PyUnicode_GET_LENGTH(str) == 1) {
12009 const int kind = PyUnicode_KIND(str);
12010 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012011 if (kind == PyUnicode_1BYTE_KIND) {
12012 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012013 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012014 }
12015 else if (kind == PyUnicode_2BYTE_KIND) {
12016 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012017 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012018 ucs2[n] = fill_char;
12019 } else {
12020 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12021 assert(kind == PyUnicode_4BYTE_KIND);
12022 for (n = 0; n < len; ++n)
12023 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 }
12026 else {
12027 /* number of characters copied this far */
12028 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012029 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 char *to = (char *) PyUnicode_DATA(u);
12031 Py_MEMCPY(to, PyUnicode_DATA(str),
12032 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 n = (done <= nchars-done) ? done : nchars-done;
12035 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012036 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 }
12039
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012040 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012041 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042}
12043
Alexander Belopolsky40018472011-02-26 01:02:56 +000012044PyObject *
12045PyUnicode_Replace(PyObject *obj,
12046 PyObject *subobj,
12047 PyObject *replobj,
12048 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049{
12050 PyObject *self;
12051 PyObject *str1;
12052 PyObject *str2;
12053 PyObject *result;
12054
12055 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012056 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012059 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 Py_DECREF(self);
12061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 }
12063 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012064 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 Py_DECREF(self);
12066 Py_DECREF(str1);
12067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012069 if (PyUnicode_READY(self) == -1 ||
12070 PyUnicode_READY(str1) == -1 ||
12071 PyUnicode_READY(str2) == -1)
12072 result = NULL;
12073 else
12074 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 Py_DECREF(self);
12076 Py_DECREF(str1);
12077 Py_DECREF(str2);
12078 return result;
12079}
12080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012081PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012082 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083\n\
12084Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012085old replaced by new. If the optional argument count is\n\
12086given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
12088static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 PyObject *str1;
12092 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012093 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 PyObject *result;
12095
Martin v. Löwis18e16552006-02-15 17:27:45 +000012096 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012098 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012101 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 return NULL;
12103 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012104 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 Py_DECREF(str1);
12106 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012107 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012108 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12109 result = NULL;
12110 else
12111 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
12113 Py_DECREF(str1);
12114 Py_DECREF(str2);
12115 return result;
12116}
12117
Alexander Belopolsky40018472011-02-26 01:02:56 +000012118static PyObject *
12119unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012121 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 Py_ssize_t isize;
12123 Py_ssize_t osize, squote, dquote, i, o;
12124 Py_UCS4 max, quote;
12125 int ikind, okind;
12126 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012129 return NULL;
12130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 isize = PyUnicode_GET_LENGTH(unicode);
12132 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 /* Compute length of output, quote characters, and
12135 maximum character */
12136 osize = 2; /* quotes */
12137 max = 127;
12138 squote = dquote = 0;
12139 ikind = PyUnicode_KIND(unicode);
12140 for (i = 0; i < isize; i++) {
12141 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12142 switch (ch) {
12143 case '\'': squote++; osize++; break;
12144 case '"': dquote++; osize++; break;
12145 case '\\': case '\t': case '\r': case '\n':
12146 osize += 2; break;
12147 default:
12148 /* Fast-path ASCII */
12149 if (ch < ' ' || ch == 0x7f)
12150 osize += 4; /* \xHH */
12151 else if (ch < 0x7f)
12152 osize++;
12153 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12154 osize++;
12155 max = ch > max ? ch : max;
12156 }
12157 else if (ch < 0x100)
12158 osize += 4; /* \xHH */
12159 else if (ch < 0x10000)
12160 osize += 6; /* \uHHHH */
12161 else
12162 osize += 10; /* \uHHHHHHHH */
12163 }
12164 }
12165
12166 quote = '\'';
12167 if (squote) {
12168 if (dquote)
12169 /* Both squote and dquote present. Use squote,
12170 and escape them */
12171 osize += squote;
12172 else
12173 quote = '"';
12174 }
12175
12176 repr = PyUnicode_New(osize, max);
12177 if (repr == NULL)
12178 return NULL;
12179 okind = PyUnicode_KIND(repr);
12180 odata = PyUnicode_DATA(repr);
12181
12182 PyUnicode_WRITE(okind, odata, 0, quote);
12183 PyUnicode_WRITE(okind, odata, osize-1, quote);
12184
12185 for (i = 0, o = 1; i < isize; i++) {
12186 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012187
12188 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if ((ch == quote) || (ch == '\\')) {
12190 PyUnicode_WRITE(okind, odata, o++, '\\');
12191 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012192 continue;
12193 }
12194
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012196 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 PyUnicode_WRITE(okind, odata, o++, '\\');
12198 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012199 }
12200 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 PyUnicode_WRITE(okind, odata, o++, '\\');
12202 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012203 }
12204 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 PyUnicode_WRITE(okind, odata, o++, '\\');
12206 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012207 }
12208
12209 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012210 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 PyUnicode_WRITE(okind, odata, o++, '\\');
12212 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012213 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12214 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012215 }
12216
Georg Brandl559e5d72008-06-11 18:37:52 +000012217 /* Copy ASCII characters as-is */
12218 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012220 }
12221
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012223 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012224 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012225 (categories Z* and C* except ASCII space)
12226 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012228 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (ch <= 0xff) {
12230 PyUnicode_WRITE(okind, odata, o++, '\\');
12231 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012232 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012234 }
12235 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 else if (ch >= 0x10000) {
12237 PyUnicode_WRITE(okind, odata, o++, '\\');
12238 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012239 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12242 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12243 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12244 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12245 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012247 }
12248 /* Map 16-bit characters to '\uxxxx' */
12249 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 PyUnicode_WRITE(okind, odata, o++, '\\');
12251 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012252 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12253 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012256 }
12257 }
12258 /* Copy characters as-is */
12259 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012261 }
12262 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012265 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012266 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267}
12268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012269PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271\n\
12272Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012273such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274arguments start and end are interpreted as in slice notation.\n\
12275\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012276Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
12278static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012281 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012282 Py_ssize_t start;
12283 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012284 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285
Jesus Ceaac451502011-04-20 17:09:23 +020012286 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12287 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 if (PyUnicode_READY(self) == -1)
12291 return NULL;
12292 if (PyUnicode_READY(substring) == -1)
12293 return NULL;
12294
Victor Stinner7931d9a2011-11-04 00:22:48 +010012295 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
12297 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 if (result == -2)
12300 return NULL;
12301
Christian Heimes217cfd12007-12-02 14:31:20 +000012302 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303}
12304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012305PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012308Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
12310static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012313 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012314 Py_ssize_t start;
12315 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012316 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317
Jesus Ceaac451502011-04-20 17:09:23 +020012318 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12319 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 if (PyUnicode_READY(self) == -1)
12323 return NULL;
12324 if (PyUnicode_READY(substring) == -1)
12325 return NULL;
12326
Victor Stinner7931d9a2011-11-04 00:22:48 +010012327 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
12329 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 if (result == -2)
12332 return NULL;
12333
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334 if (result < 0) {
12335 PyErr_SetString(PyExc_ValueError, "substring not found");
12336 return NULL;
12337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338
Christian Heimes217cfd12007-12-02 14:31:20 +000012339 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340}
12341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012342PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012345Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012346done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
12348static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012349unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012351 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 Py_UCS4 fillchar = ' ';
12353
Victor Stinnere9a29352011-10-01 02:14:59 +020012354 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012356
Benjamin Petersonbac79492012-01-14 13:34:47 -050012357 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 return NULL;
12359
Victor Stinnerc4b49542011-12-11 22:44:26 +010012360 if (PyUnicode_GET_LENGTH(self) >= width)
12361 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
Victor Stinnerc4b49542011-12-11 22:44:26 +010012363 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364}
12365
Alexander Belopolsky40018472011-02-26 01:02:56 +000012366PyObject *
12367PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368{
12369 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012370
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371 s = PyUnicode_FromObject(s);
12372 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012373 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 if (sep != NULL) {
12375 sep = PyUnicode_FromObject(sep);
12376 if (sep == NULL) {
12377 Py_DECREF(s);
12378 return NULL;
12379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 }
12381
Victor Stinner9310abb2011-10-05 00:59:23 +020012382 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
12384 Py_DECREF(s);
12385 Py_XDECREF(sep);
12386 return result;
12387}
12388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012389PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391\n\
12392Return a list of the words in S, using sep as the\n\
12393delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012394splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012395whitespace string is a separator and empty strings are\n\
12396removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
12398static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012399unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400{
12401 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012402 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403
Martin v. Löwis18e16552006-02-15 17:27:45 +000012404 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405 return NULL;
12406
12407 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012408 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012410 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012412 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413}
12414
Thomas Wouters477c8d52006-05-27 19:21:47 +000012415PyObject *
12416PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12417{
12418 PyObject* str_obj;
12419 PyObject* sep_obj;
12420 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 int kind1, kind2, kind;
12422 void *buf1 = NULL, *buf2 = NULL;
12423 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012424
12425 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012426 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012427 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012429 if (!sep_obj) {
12430 Py_DECREF(str_obj);
12431 return NULL;
12432 }
12433 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12434 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435 Py_DECREF(str_obj);
12436 return NULL;
12437 }
12438
Victor Stinner14f8f022011-10-05 20:58:25 +020012439 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012441 kind = Py_MAX(kind1, kind2);
12442 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012444 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 if (!buf1)
12446 goto onError;
12447 buf2 = PyUnicode_DATA(sep_obj);
12448 if (kind2 != kind)
12449 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12450 if (!buf2)
12451 goto onError;
12452 len1 = PyUnicode_GET_LENGTH(str_obj);
12453 len2 = PyUnicode_GET_LENGTH(sep_obj);
12454
Benjamin Petersonead6b532011-12-20 17:23:42 -060012455 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012457 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12458 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12459 else
12460 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 break;
12462 case PyUnicode_2BYTE_KIND:
12463 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12464 break;
12465 case PyUnicode_4BYTE_KIND:
12466 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12467 break;
12468 default:
12469 assert(0);
12470 out = 0;
12471 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012472
12473 Py_DECREF(sep_obj);
12474 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 if (kind1 != kind)
12476 PyMem_Free(buf1);
12477 if (kind2 != kind)
12478 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012479
12480 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 onError:
12482 Py_DECREF(sep_obj);
12483 Py_DECREF(str_obj);
12484 if (kind1 != kind && buf1)
12485 PyMem_Free(buf1);
12486 if (kind2 != kind && buf2)
12487 PyMem_Free(buf2);
12488 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489}
12490
12491
12492PyObject *
12493PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12494{
12495 PyObject* str_obj;
12496 PyObject* sep_obj;
12497 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 int kind1, kind2, kind;
12499 void *buf1 = NULL, *buf2 = NULL;
12500 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012501
12502 str_obj = PyUnicode_FromObject(str_in);
12503 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012505 sep_obj = PyUnicode_FromObject(sep_in);
12506 if (!sep_obj) {
12507 Py_DECREF(str_obj);
12508 return NULL;
12509 }
12510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 kind1 = PyUnicode_KIND(str_in);
12512 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012513 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 buf1 = PyUnicode_DATA(str_in);
12515 if (kind1 != kind)
12516 buf1 = _PyUnicode_AsKind(str_in, kind);
12517 if (!buf1)
12518 goto onError;
12519 buf2 = PyUnicode_DATA(sep_obj);
12520 if (kind2 != kind)
12521 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12522 if (!buf2)
12523 goto onError;
12524 len1 = PyUnicode_GET_LENGTH(str_obj);
12525 len2 = PyUnicode_GET_LENGTH(sep_obj);
12526
Benjamin Petersonead6b532011-12-20 17:23:42 -060012527 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012529 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12530 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12531 else
12532 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 break;
12534 case PyUnicode_2BYTE_KIND:
12535 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12536 break;
12537 case PyUnicode_4BYTE_KIND:
12538 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12539 break;
12540 default:
12541 assert(0);
12542 out = 0;
12543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012544
12545 Py_DECREF(sep_obj);
12546 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 if (kind1 != kind)
12548 PyMem_Free(buf1);
12549 if (kind2 != kind)
12550 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012551
12552 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 onError:
12554 Py_DECREF(sep_obj);
12555 Py_DECREF(str_obj);
12556 if (kind1 != kind && buf1)
12557 PyMem_Free(buf1);
12558 if (kind2 != kind && buf2)
12559 PyMem_Free(buf2);
12560 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561}
12562
12563PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012565\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012566Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012567the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012568found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012569
12570static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012571unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572{
Victor Stinner9310abb2011-10-05 00:59:23 +020012573 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012574}
12575
12576PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012577 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012579Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012580the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012581separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012582
12583static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012584unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012585{
Victor Stinner9310abb2011-10-05 00:59:23 +020012586 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012587}
12588
Alexander Belopolsky40018472011-02-26 01:02:56 +000012589PyObject *
12590PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012591{
12592 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012593
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012594 s = PyUnicode_FromObject(s);
12595 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012596 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 if (sep != NULL) {
12598 sep = PyUnicode_FromObject(sep);
12599 if (sep == NULL) {
12600 Py_DECREF(s);
12601 return NULL;
12602 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012603 }
12604
Victor Stinner9310abb2011-10-05 00:59:23 +020012605 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012606
12607 Py_DECREF(s);
12608 Py_XDECREF(sep);
12609 return result;
12610}
12611
12612PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012614\n\
12615Return a list of the words in S, using sep as the\n\
12616delimiter string, starting at the end of the string and\n\
12617working to the front. If maxsplit is given, at most maxsplit\n\
12618splits are done. If sep is not specified, any whitespace string\n\
12619is a separator.");
12620
12621static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012622unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012623{
12624 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012625 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012626
Martin v. Löwis18e16552006-02-15 17:27:45 +000012627 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012628 return NULL;
12629
12630 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012632 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012633 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012634 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012635 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012636}
12637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012638PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640\n\
12641Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012642Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012643is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
12645static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012646unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012648 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012649 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012651 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12652 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653 return NULL;
12654
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012655 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656}
12657
12658static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012659PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012661 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662}
12663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012664PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666\n\
12667Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
12670static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012671unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012673 if (PyUnicode_READY(self) == -1)
12674 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012675 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676}
12677
Georg Brandlceee0772007-11-27 23:48:05 +000012678PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012680\n\
12681Return a translation table usable for str.translate().\n\
12682If there is only one argument, it must be a dictionary mapping Unicode\n\
12683ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012684Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012685If there are two arguments, they must be strings of equal length, and\n\
12686in the resulting dictionary, each character in x will be mapped to the\n\
12687character at the same position in y. If there is a third argument, it\n\
12688must be a string, whose characters will be mapped to None in the result.");
12689
12690static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012691unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012692{
12693 PyObject *x, *y = NULL, *z = NULL;
12694 PyObject *new = NULL, *key, *value;
12695 Py_ssize_t i = 0;
12696 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697
Georg Brandlceee0772007-11-27 23:48:05 +000012698 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12699 return NULL;
12700 new = PyDict_New();
12701 if (!new)
12702 return NULL;
12703 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 int x_kind, y_kind, z_kind;
12705 void *x_data, *y_data, *z_data;
12706
Georg Brandlceee0772007-11-27 23:48:05 +000012707 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012708 if (!PyUnicode_Check(x)) {
12709 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12710 "be a string if there is a second argument");
12711 goto err;
12712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012714 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12715 "arguments must have equal length");
12716 goto err;
12717 }
12718 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 x_kind = PyUnicode_KIND(x);
12720 y_kind = PyUnicode_KIND(y);
12721 x_data = PyUnicode_DATA(x);
12722 y_data = PyUnicode_DATA(y);
12723 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12724 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012725 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012726 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012727 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012728 if (!value) {
12729 Py_DECREF(key);
12730 goto err;
12731 }
Georg Brandlceee0772007-11-27 23:48:05 +000012732 res = PyDict_SetItem(new, key, value);
12733 Py_DECREF(key);
12734 Py_DECREF(value);
12735 if (res < 0)
12736 goto err;
12737 }
12738 /* create entries for deleting chars in z */
12739 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 z_kind = PyUnicode_KIND(z);
12741 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012742 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012744 if (!key)
12745 goto err;
12746 res = PyDict_SetItem(new, key, Py_None);
12747 Py_DECREF(key);
12748 if (res < 0)
12749 goto err;
12750 }
12751 }
12752 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 int kind;
12754 void *data;
12755
Georg Brandlceee0772007-11-27 23:48:05 +000012756 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012757 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012758 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12759 "to maketrans it must be a dict");
12760 goto err;
12761 }
12762 /* copy entries into the new dict, converting string keys to int keys */
12763 while (PyDict_Next(x, &i, &key, &value)) {
12764 if (PyUnicode_Check(key)) {
12765 /* convert string keys to integer keys */
12766 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012767 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012768 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12769 "table must be of length 1");
12770 goto err;
12771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 kind = PyUnicode_KIND(key);
12773 data = PyUnicode_DATA(key);
12774 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012775 if (!newkey)
12776 goto err;
12777 res = PyDict_SetItem(new, newkey, value);
12778 Py_DECREF(newkey);
12779 if (res < 0)
12780 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012781 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012782 /* just keep integer keys */
12783 if (PyDict_SetItem(new, key, value) < 0)
12784 goto err;
12785 } else {
12786 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12787 "be strings or integers");
12788 goto err;
12789 }
12790 }
12791 }
12792 return new;
12793 err:
12794 Py_DECREF(new);
12795 return NULL;
12796}
12797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012798PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800\n\
12801Return a copy of the string S, where all characters have been mapped\n\
12802through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012803Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012804Unmapped characters are left untouched. Characters mapped to None\n\
12805are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
12807static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811}
12812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012813PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012816Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012819unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012821 if (PyUnicode_READY(self) == -1)
12822 return NULL;
12823 if (PyUnicode_IS_ASCII(self))
12824 return ascii_upper_or_lower(self, 0);
12825 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826}
12827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012828PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012831Pad a numeric string S with zeros on the left, to fill a field\n\
12832of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833
12834static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012835unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012837 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012838 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012839 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 int kind;
12841 void *data;
12842 Py_UCS4 chr;
12843
Martin v. Löwis18e16552006-02-15 17:27:45 +000012844 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845 return NULL;
12846
Benjamin Petersonbac79492012-01-14 13:34:47 -050012847 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849
Victor Stinnerc4b49542011-12-11 22:44:26 +010012850 if (PyUnicode_GET_LENGTH(self) >= width)
12851 return unicode_result_unchanged(self);
12852
12853 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
12855 u = pad(self, fill, 0, '0');
12856
Walter Dörwald068325e2002-04-15 13:36:47 +000012857 if (u == NULL)
12858 return NULL;
12859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 kind = PyUnicode_KIND(u);
12861 data = PyUnicode_DATA(u);
12862 chr = PyUnicode_READ(kind, data, fill);
12863
12864 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 PyUnicode_WRITE(kind, data, 0, chr);
12867 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868 }
12869
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012870 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012871 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
#if 0
/* Compiled out by the surrounding "#if 0".  When enabled, this is a plain
   wrapper returning PyUnicode_TransformDecimalAndSpaceToASCII(self); the
   matching (also disabled) method-table entry describes it as a helper for
   debugging the implementation. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012882PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012885Return True if S starts with the specified prefix, False otherwise.\n\
12886With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012887With optional end, stop comparing S at that position.\n\
12888prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889
12890static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012891unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012894 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012895 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012896 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012897 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012898 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899
Jesus Ceaac451502011-04-20 17:09:23 +020012900 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012901 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012902 if (PyTuple_Check(subobj)) {
12903 Py_ssize_t i;
12904 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012905 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012906 if (substring == NULL)
12907 return NULL;
12908 result = tailmatch(self, substring, start, end, -1);
12909 Py_DECREF(substring);
12910 if (result) {
12911 Py_RETURN_TRUE;
12912 }
12913 }
12914 /* nothing matched */
12915 Py_RETURN_FALSE;
12916 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012917 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012918 if (substring == NULL) {
12919 if (PyErr_ExceptionMatches(PyExc_TypeError))
12920 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12921 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012922 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012923 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012924 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012926 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927}
12928
12929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012930PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012933Return True if S ends with the specified suffix, False otherwise.\n\
12934With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012935With optional end, stop comparing S at that position.\n\
12936suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937
12938static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012939unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012940 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012942 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012943 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012944 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012945 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012946 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947
Jesus Ceaac451502011-04-20 17:09:23 +020012948 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012949 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012950 if (PyTuple_Check(subobj)) {
12951 Py_ssize_t i;
12952 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012953 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012955 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012957 result = tailmatch(self, substring, start, end, +1);
12958 Py_DECREF(substring);
12959 if (result) {
12960 Py_RETURN_TRUE;
12961 }
12962 }
12963 Py_RETURN_FALSE;
12964 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012965 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012966 if (substring == NULL) {
12967 if (PyErr_ExceptionMatches(PyExc_TypeError))
12968 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12969 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012970 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012971 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012972 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012974 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012975}
12976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012978
12979PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012980 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012981\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012982Return a formatted version of S, using substitutions from args and kwargs.\n\
12983The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012984
Eric Smith27bbca62010-11-04 17:06:58 +000012985PyDoc_STRVAR(format_map__doc__,
12986 "S.format_map(mapping) -> str\n\
12987\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012988Return a formatted version of S, using substitutions from mapping.\n\
12989The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012990
Eric Smith4a7d76d2008-05-30 18:10:19 +000012991static PyObject *
12992unicode__format__(PyObject* self, PyObject* args)
12993{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012994 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012995
12996 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12997 return NULL;
12998
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012999 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013001 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013002}
13003
Eric Smith8c663262007-08-25 02:26:07 +000013004PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013006\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013007Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013008
13009static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013010unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 Py_ssize_t size;
13013
13014 /* If it's a compact object, account for base structure +
13015 character data. */
13016 if (PyUnicode_IS_COMPACT_ASCII(v))
13017 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13018 else if (PyUnicode_IS_COMPACT(v))
13019 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013020 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 else {
13022 /* If it is a two-block object, account for base object, and
13023 for character block if present. */
13024 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013025 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013027 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 }
13029 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013030 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013031 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013033 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013034 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035
13036 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013037}
13038
13039PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013041
13042static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013043unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013044{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013045 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 if (!copy)
13047 return NULL;
13048 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013049}
13050
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051static PyMethodDef unicode_methods[] = {
13052
13053 /* Order is according to common usage: often used methods should
13054 appear first, since lookup is done sequentially. */
13055
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013056 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013057 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13058 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013059 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013060 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13061 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013062 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013063 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13064 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13065 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13066 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13067 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013068 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013069 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13070 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13071 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013072 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013073 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13074 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13075 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013076 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013077 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013078 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013079 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013080 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13081 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13082 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13083 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13084 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13085 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13086 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13087 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13088 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13089 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13090 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13091 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13092 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13093 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013094 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013095 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013096 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013097 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013098 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013099 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013100 {"maketrans", (PyCFunction) unicode_maketrans,
13101 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013102 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013103#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013104 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013105 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106#endif
13107
Benjamin Peterson14339b62009-01-31 16:36:08 +000013108 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 {NULL, NULL}
13110};
13111
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013112static PyObject *
13113unicode_mod(PyObject *v, PyObject *w)
13114{
Brian Curtindfc80e32011-08-10 20:28:54 -050013115 if (!PyUnicode_Check(v))
13116 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013118}
13119
13120static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013121 0, /*nb_add*/
13122 0, /*nb_subtract*/
13123 0, /*nb_multiply*/
13124 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013125};
13126
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013128 (lenfunc) unicode_length, /* sq_length */
13129 PyUnicode_Concat, /* sq_concat */
13130 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13131 (ssizeargfunc) unicode_getitem, /* sq_item */
13132 0, /* sq_slice */
13133 0, /* sq_ass_item */
13134 0, /* sq_ass_slice */
13135 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136};
13137
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013138static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013139unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 if (PyUnicode_READY(self) == -1)
13142 return NULL;
13143
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013144 if (PyIndex_Check(item)) {
13145 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013146 if (i == -1 && PyErr_Occurred())
13147 return NULL;
13148 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013150 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013151 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013152 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013153 PyObject *result;
13154 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013155 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013156 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013159 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013160 return NULL;
13161 }
13162
13163 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013164 Py_INCREF(unicode_empty);
13165 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013167 slicelength == PyUnicode_GET_LENGTH(self)) {
13168 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013169 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013170 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013171 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013172 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013173 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013174 src_kind = PyUnicode_KIND(self);
13175 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013176 if (!PyUnicode_IS_ASCII(self)) {
13177 kind_limit = kind_maxchar_limit(src_kind);
13178 max_char = 0;
13179 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13180 ch = PyUnicode_READ(src_kind, src_data, cur);
13181 if (ch > max_char) {
13182 max_char = ch;
13183 if (max_char >= kind_limit)
13184 break;
13185 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013186 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013187 }
Victor Stinner55c99112011-10-13 01:17:06 +020013188 else
13189 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013190 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013191 if (result == NULL)
13192 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013193 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013194 dest_data = PyUnicode_DATA(result);
13195
13196 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013197 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13198 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013199 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013200 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013201 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013202 } else {
13203 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13204 return NULL;
13205 }
13206}
13207
13208static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 (lenfunc)unicode_length, /* mp_length */
13210 (binaryfunc)unicode_subscript, /* mp_subscript */
13211 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013212};
13213
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215/* Helpers for PyUnicode_Format() */
13216
13217static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013218getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013220 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 (*p_argidx)++;
13223 if (arglen < 0)
13224 return args;
13225 else
13226 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227 }
13228 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230 return NULL;
13231}
13232
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013233/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013235static PyObject *
13236formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013238 char *p;
13239 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013241
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242 x = PyFloat_AsDouble(v);
13243 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013244 return NULL;
13245
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013248
Eric Smith0923d1d2009-04-16 20:16:10 +000013249 p = PyOS_double_to_string(x, type, prec,
13250 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013251 if (p == NULL)
13252 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013254 PyMem_Free(p);
13255 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256}
13257
Tim Peters38fd5b62000-09-21 05:43:11 +000013258static PyObject*
13259formatlong(PyObject *val, int flags, int prec, int type)
13260{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 char *buf;
13262 int len;
13263 PyObject *str; /* temporary string object. */
13264 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013265
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13267 if (!str)
13268 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013270 Py_DECREF(str);
13271 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013272}
13273
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013274static Py_UCS4
13275formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013277 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013278 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013280 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 goto onError;
13283 }
13284 else {
13285 /* Integer input truncated to a character */
13286 long x;
13287 x = PyLong_AsLong(v);
13288 if (x == -1 && PyErr_Occurred())
13289 goto onError;
13290
Victor Stinner8faf8212011-12-08 22:14:11 +010013291 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 PyErr_SetString(PyExc_OverflowError,
13293 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013294 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 }
13296
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013297 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013299
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013301 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013303 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013306static int
13307repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13308{
13309 int r;
13310 assert(count > 0);
13311 assert(PyUnicode_Check(obj));
13312 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013313 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013314 if (repeated == NULL)
13315 return -1;
13316 r = _PyAccu_Accumulate(acc, repeated);
13317 Py_DECREF(repeated);
13318 return r;
13319 }
13320 else {
13321 do {
13322 if (_PyAccu_Accumulate(acc, obj))
13323 return -1;
13324 } while (--count);
13325 return 0;
13326 }
13327}
13328
Alexander Belopolsky40018472011-02-26 01:02:56 +000013329PyObject *
13330PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 void *fmt;
13333 int fmtkind;
13334 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013336 int r;
13337 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013340 PyObject *temp = NULL;
13341 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013342 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013343 _PyAccu acc;
13344 static PyObject *plus, *minus, *blank, *zero, *percent;
13345
13346 if (!plus && !(plus = get_latin1_char('+')))
13347 return NULL;
13348 if (!minus && !(minus = get_latin1_char('-')))
13349 return NULL;
13350 if (!blank && !(blank = get_latin1_char(' ')))
13351 return NULL;
13352 if (!zero && !(zero = get_latin1_char('0')))
13353 return NULL;
13354 if (!percent && !(percent = get_latin1_char('%')))
13355 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013356
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 PyErr_BadInternalCall();
13359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013361 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013362 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013363 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013364 if (PyUnicode_READY(uformat) == -1)
13365 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013366 if (_PyAccu_Init(&acc))
13367 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368 fmt = PyUnicode_DATA(uformat);
13369 fmtkind = PyUnicode_KIND(uformat);
13370 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13371 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 arglen = PyTuple_Size(args);
13375 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376 }
13377 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 arglen = -1;
13379 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013381 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013382 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384
13385 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013387 PyObject *nonfmt;
13388 Py_ssize_t nonfmtpos;
13389 nonfmtpos = fmtpos++;
13390 while (fmtcnt >= 0 &&
13391 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13392 fmtpos++;
13393 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013394 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013395 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 if (nonfmt == NULL)
13397 goto onError;
13398 r = _PyAccu_Accumulate(&acc, nonfmt);
13399 Py_DECREF(nonfmt);
13400 if (r)
13401 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013402 }
13403 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 /* Got a format specifier */
13405 int flags = 0;
13406 Py_ssize_t width = -1;
13407 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013408 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013409 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 int isnumok;
13411 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013412 void *pbuf = NULL;
13413 Py_ssize_t pindex, len;
13414 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 fmtpos++;
13417 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13418 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 Py_ssize_t keylen;
13420 PyObject *key;
13421 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013422
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 if (dict == NULL) {
13424 PyErr_SetString(PyExc_TypeError,
13425 "format requires a mapping");
13426 goto onError;
13427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013428 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 /* Skip over balanced parentheses */
13432 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013437 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013439 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 if (fmtcnt < 0 || pcount > 0) {
13441 PyErr_SetString(PyExc_ValueError,
13442 "incomplete format key");
13443 goto onError;
13444 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013445 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013446 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 if (key == NULL)
13448 goto onError;
13449 if (args_owned) {
13450 Py_DECREF(args);
13451 args_owned = 0;
13452 }
13453 args = PyObject_GetItem(dict, key);
13454 Py_DECREF(key);
13455 if (args == NULL) {
13456 goto onError;
13457 }
13458 args_owned = 1;
13459 arglen = -1;
13460 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013461 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 case '-': flags |= F_LJUST; continue;
13465 case '+': flags |= F_SIGN; continue;
13466 case ' ': flags |= F_BLANK; continue;
13467 case '#': flags |= F_ALT; continue;
13468 case '0': flags |= F_ZERO; continue;
13469 }
13470 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013471 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 if (c == '*') {
13473 v = getnextarg(args, arglen, &argidx);
13474 if (v == NULL)
13475 goto onError;
13476 if (!PyLong_Check(v)) {
13477 PyErr_SetString(PyExc_TypeError,
13478 "* wants int");
13479 goto onError;
13480 }
13481 width = PyLong_AsLong(v);
13482 if (width == -1 && PyErr_Occurred())
13483 goto onError;
13484 if (width < 0) {
13485 flags |= F_LJUST;
13486 width = -width;
13487 }
13488 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 }
13491 else if (c >= '0' && c <= '9') {
13492 width = c - '0';
13493 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013494 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 if (c < '0' || c > '9')
13496 break;
13497 if ((width*10) / 10 != width) {
13498 PyErr_SetString(PyExc_ValueError,
13499 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013500 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 }
13502 width = width*10 + (c - '0');
13503 }
13504 }
13505 if (c == '.') {
13506 prec = 0;
13507 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 if (c == '*') {
13510 v = getnextarg(args, arglen, &argidx);
13511 if (v == NULL)
13512 goto onError;
13513 if (!PyLong_Check(v)) {
13514 PyErr_SetString(PyExc_TypeError,
13515 "* wants int");
13516 goto onError;
13517 }
13518 prec = PyLong_AsLong(v);
13519 if (prec == -1 && PyErr_Occurred())
13520 goto onError;
13521 if (prec < 0)
13522 prec = 0;
13523 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013525 }
13526 else if (c >= '0' && c <= '9') {
13527 prec = c - '0';
13528 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013529 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 if (c < '0' || c > '9')
13531 break;
13532 if ((prec*10) / 10 != prec) {
13533 PyErr_SetString(PyExc_ValueError,
13534 "prec too big");
13535 goto onError;
13536 }
13537 prec = prec*10 + (c - '0');
13538 }
13539 }
13540 } /* prec */
13541 if (fmtcnt >= 0) {
13542 if (c == 'h' || c == 'l' || c == 'L') {
13543 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013544 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 }
13546 }
13547 if (fmtcnt < 0) {
13548 PyErr_SetString(PyExc_ValueError,
13549 "incomplete format");
13550 goto onError;
13551 }
13552 if (c != '%') {
13553 v = getnextarg(args, arglen, &argidx);
13554 if (v == NULL)
13555 goto onError;
13556 }
13557 sign = 0;
13558 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013559 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 switch (c) {
13561
13562 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013563 _PyAccu_Accumulate(&acc, percent);
13564 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013565
13566 case 's':
13567 case 'r':
13568 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013569 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 temp = v;
13571 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013572 }
13573 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 if (c == 's')
13575 temp = PyObject_Str(v);
13576 else if (c == 'r')
13577 temp = PyObject_Repr(v);
13578 else
13579 temp = PyObject_ASCII(v);
13580 if (temp == NULL)
13581 goto onError;
13582 if (PyUnicode_Check(temp))
13583 /* nothing to do */;
13584 else {
13585 Py_DECREF(temp);
13586 PyErr_SetString(PyExc_TypeError,
13587 "%s argument has non-string str()");
13588 goto onError;
13589 }
13590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013591 if (PyUnicode_READY(temp) == -1) {
13592 Py_CLEAR(temp);
13593 goto onError;
13594 }
13595 pbuf = PyUnicode_DATA(temp);
13596 kind = PyUnicode_KIND(temp);
13597 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 if (prec >= 0 && len > prec)
13599 len = prec;
13600 break;
13601
13602 case 'i':
13603 case 'd':
13604 case 'u':
13605 case 'o':
13606 case 'x':
13607 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 isnumok = 0;
13609 if (PyNumber_Check(v)) {
13610 PyObject *iobj=NULL;
13611
13612 if (PyLong_Check(v)) {
13613 iobj = v;
13614 Py_INCREF(iobj);
13615 }
13616 else {
13617 iobj = PyNumber_Long(v);
13618 }
13619 if (iobj!=NULL) {
13620 if (PyLong_Check(iobj)) {
13621 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013622 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 Py_DECREF(iobj);
13624 if (!temp)
13625 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013626 if (PyUnicode_READY(temp) == -1) {
13627 Py_CLEAR(temp);
13628 goto onError;
13629 }
13630 pbuf = PyUnicode_DATA(temp);
13631 kind = PyUnicode_KIND(temp);
13632 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 sign = 1;
13634 }
13635 else {
13636 Py_DECREF(iobj);
13637 }
13638 }
13639 }
13640 if (!isnumok) {
13641 PyErr_Format(PyExc_TypeError,
13642 "%%%c format: a number is required, "
13643 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13644 goto onError;
13645 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013647 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013648 fillobj = zero;
13649 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 break;
13651
13652 case 'e':
13653 case 'E':
13654 case 'f':
13655 case 'F':
13656 case 'g':
13657 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013658 temp = formatfloat(v, flags, prec, c);
13659 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013661 if (PyUnicode_READY(temp) == -1) {
13662 Py_CLEAR(temp);
13663 goto onError;
13664 }
13665 pbuf = PyUnicode_DATA(temp);
13666 kind = PyUnicode_KIND(temp);
13667 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013669 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 fillobj = zero;
13672 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 break;
13674
13675 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013676 {
13677 Py_UCS4 ch = formatchar(v);
13678 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013680 temp = _PyUnicode_FromUCS4(&ch, 1);
13681 if (temp == NULL)
13682 goto onError;
13683 pbuf = PyUnicode_DATA(temp);
13684 kind = PyUnicode_KIND(temp);
13685 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013686 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013687 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013688
13689 default:
13690 PyErr_Format(PyExc_ValueError,
13691 "unsupported format character '%c' (0x%x) "
13692 "at index %zd",
13693 (31<=c && c<=126) ? (char)c : '?',
13694 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013695 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 goto onError;
13697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013698 /* pbuf is initialized here. */
13699 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013701 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13702 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013704 pindex++;
13705 }
13706 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13707 signobj = plus;
13708 len--;
13709 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 }
13711 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013712 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013714 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 else
13716 sign = 0;
13717 }
13718 if (width < len)
13719 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013721 if (fill != ' ') {
13722 assert(signobj != NULL);
13723 if (_PyAccu_Accumulate(&acc, signobj))
13724 goto onError;
13725 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 if (width > len)
13727 width--;
13728 }
13729 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013730 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013731 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013733 second = get_latin1_char(
13734 PyUnicode_READ(kind, pbuf, pindex + 1));
13735 pindex += 2;
13736 if (second == NULL ||
13737 _PyAccu_Accumulate(&acc, zero) ||
13738 _PyAccu_Accumulate(&acc, second))
13739 goto onError;
13740 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 width -= 2;
13743 if (width < 0)
13744 width = 0;
13745 len -= 2;
13746 }
13747 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013748 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013749 if (repeat_accumulate(&acc, fillobj, width - len))
13750 goto onError;
13751 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 }
13753 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013754 if (sign) {
13755 assert(signobj != NULL);
13756 if (_PyAccu_Accumulate(&acc, signobj))
13757 goto onError;
13758 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013760 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13761 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013762 second = get_latin1_char(
13763 PyUnicode_READ(kind, pbuf, pindex + 1));
13764 pindex += 2;
13765 if (second == NULL ||
13766 _PyAccu_Accumulate(&acc, zero) ||
13767 _PyAccu_Accumulate(&acc, second))
13768 goto onError;
13769 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013770 }
13771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013772 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013773 if (temp != NULL) {
13774 assert(pbuf == PyUnicode_DATA(temp));
13775 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013776 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013777 else {
13778 const char *p = (const char *) pbuf;
13779 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013780 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013781 v = PyUnicode_FromKindAndData(kind, p, len);
13782 }
13783 if (v == NULL)
13784 goto onError;
13785 r = _PyAccu_Accumulate(&acc, v);
13786 Py_DECREF(v);
13787 if (r)
13788 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013789 if (width > len && repeat_accumulate(&acc, blank, width - len))
13790 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013791 if (dict && (argidx < arglen) && c != '%') {
13792 PyErr_SetString(PyExc_TypeError,
13793 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 goto onError;
13795 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013796 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013797 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798 } /* until end */
13799 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 PyErr_SetString(PyExc_TypeError,
13801 "not all arguments converted during string formatting");
13802 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803 }
13804
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013805 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013807 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808 }
13809 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013810 Py_XDECREF(temp);
13811 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013812 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013816 Py_XDECREF(temp);
13817 Py_XDECREF(second);
13818 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013821 }
13822 return NULL;
13823}
13824
Jeremy Hylton938ace62002-07-17 16:30:39 +000013825static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013826unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13827
Tim Peters6d6c1a32001-08-02 04:15:00 +000013828static PyObject *
13829unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13830{
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013832 static char *kwlist[] = {"object", "encoding", "errors", 0};
13833 char *encoding = NULL;
13834 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013835
Benjamin Peterson14339b62009-01-31 16:36:08 +000013836 if (type != &PyUnicode_Type)
13837 return unicode_subtype_new(type, args, kwds);
13838 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013840 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013841 if (x == NULL) {
13842 Py_INCREF(unicode_empty);
13843 return unicode_empty;
13844 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013845 if (encoding == NULL && errors == NULL)
13846 return PyObject_Str(x);
13847 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013849}
13850
Guido van Rossume023fe02001-08-30 03:12:59 +000013851static PyObject *
13852unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13853{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013854 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013855 Py_ssize_t length, char_size;
13856 int share_wstr, share_utf8;
13857 unsigned int kind;
13858 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013859
Benjamin Peterson14339b62009-01-31 16:36:08 +000013860 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013861
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013862 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013863 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013864 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013865 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013866 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013867 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013868 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013869 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013870
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013871 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013872 if (self == NULL) {
13873 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 return NULL;
13875 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013876 kind = PyUnicode_KIND(unicode);
13877 length = PyUnicode_GET_LENGTH(unicode);
13878
13879 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013880#ifdef Py_DEBUG
13881 _PyUnicode_HASH(self) = -1;
13882#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013883 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013884#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013885 _PyUnicode_STATE(self).interned = 0;
13886 _PyUnicode_STATE(self).kind = kind;
13887 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013888 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013889 _PyUnicode_STATE(self).ready = 1;
13890 _PyUnicode_WSTR(self) = NULL;
13891 _PyUnicode_UTF8_LENGTH(self) = 0;
13892 _PyUnicode_UTF8(self) = NULL;
13893 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013894 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013895
13896 share_utf8 = 0;
13897 share_wstr = 0;
13898 if (kind == PyUnicode_1BYTE_KIND) {
13899 char_size = 1;
13900 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13901 share_utf8 = 1;
13902 }
13903 else if (kind == PyUnicode_2BYTE_KIND) {
13904 char_size = 2;
13905 if (sizeof(wchar_t) == 2)
13906 share_wstr = 1;
13907 }
13908 else {
13909 assert(kind == PyUnicode_4BYTE_KIND);
13910 char_size = 4;
13911 if (sizeof(wchar_t) == 4)
13912 share_wstr = 1;
13913 }
13914
13915 /* Ensure we won't overflow the length. */
13916 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13917 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013918 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013919 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013920 data = PyObject_MALLOC((length + 1) * char_size);
13921 if (data == NULL) {
13922 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013923 goto onError;
13924 }
13925
Victor Stinnerc3c74152011-10-02 20:39:55 +020013926 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013927 if (share_utf8) {
13928 _PyUnicode_UTF8_LENGTH(self) = length;
13929 _PyUnicode_UTF8(self) = data;
13930 }
13931 if (share_wstr) {
13932 _PyUnicode_WSTR_LENGTH(self) = length;
13933 _PyUnicode_WSTR(self) = (wchar_t *)data;
13934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013935
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013936 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013937 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013938 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013939#ifdef Py_DEBUG
13940 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13941#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013942 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013943 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013944
13945onError:
13946 Py_DECREF(unicode);
13947 Py_DECREF(self);
13948 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013949}
13950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013951PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013952 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013953\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013954Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013955encoding defaults to the current default string encoding.\n\
13956errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013957
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013958static PyObject *unicode_iter(PyObject *seq);
13959
Guido van Rossumd57fd912000-03-10 22:53:23 +000013960PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013961 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 "str", /* tp_name */
13963 sizeof(PyUnicodeObject), /* tp_size */
13964 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013965 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 (destructor)unicode_dealloc, /* tp_dealloc */
13967 0, /* tp_print */
13968 0, /* tp_getattr */
13969 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013970 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 unicode_repr, /* tp_repr */
13972 &unicode_as_number, /* tp_as_number */
13973 &unicode_as_sequence, /* tp_as_sequence */
13974 &unicode_as_mapping, /* tp_as_mapping */
13975 (hashfunc) unicode_hash, /* tp_hash*/
13976 0, /* tp_call*/
13977 (reprfunc) unicode_str, /* tp_str */
13978 PyObject_GenericGetAttr, /* tp_getattro */
13979 0, /* tp_setattro */
13980 0, /* tp_as_buffer */
13981 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013983 unicode_doc, /* tp_doc */
13984 0, /* tp_traverse */
13985 0, /* tp_clear */
13986 PyUnicode_RichCompare, /* tp_richcompare */
13987 0, /* tp_weaklistoffset */
13988 unicode_iter, /* tp_iter */
13989 0, /* tp_iternext */
13990 unicode_methods, /* tp_methods */
13991 0, /* tp_members */
13992 0, /* tp_getset */
13993 &PyBaseObject_Type, /* tp_base */
13994 0, /* tp_dict */
13995 0, /* tp_descr_get */
13996 0, /* tp_descr_set */
13997 0, /* tp_dictoffset */
13998 0, /* tp_init */
13999 0, /* tp_alloc */
14000 unicode_new, /* tp_new */
14001 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014002};
14003
14004/* Initialize the Unicode implementation */
14005
Victor Stinner3a50e702011-10-18 21:21:00 +020014006int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014007{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014008 int i;
14009
Thomas Wouters477c8d52006-05-27 19:21:47 +000014010 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014011 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014012 0x000A, /* LINE FEED */
14013 0x000D, /* CARRIAGE RETURN */
14014 0x001C, /* FILE SEPARATOR */
14015 0x001D, /* GROUP SEPARATOR */
14016 0x001E, /* RECORD SEPARATOR */
14017 0x0085, /* NEXT LINE */
14018 0x2028, /* LINE SEPARATOR */
14019 0x2029, /* PARAGRAPH SEPARATOR */
14020 };
14021
Fred Drakee4315f52000-05-09 19:53:39 +000014022 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014023 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014024 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014025 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014026 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014027
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014028 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014029 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014030 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014032
14033 /* initialize the linebreak bloom filter */
14034 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014035 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014036 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014037
14038 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014039
14040#ifdef HAVE_MBCS
14041 winver.dwOSVersionInfoSize = sizeof(winver);
14042 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14043 PyErr_SetFromWindowsErr(0);
14044 return -1;
14045 }
14046#endif
14047 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014048}
14049
14050/* Finalize the Unicode implementation */
14051
/* Nothing is released by this implementation; the function always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14057
Guido van Rossumd57fd912000-03-10 22:53:23 +000014058void
Thomas Wouters78890102000-07-22 19:25:51 +000014059_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014060{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014061 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014063 Py_XDECREF(unicode_empty);
14064 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014065
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014066 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014067 if (unicode_latin1[i]) {
14068 Py_DECREF(unicode_latin1[i]);
14069 unicode_latin1[i] = NULL;
14070 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014071 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014072 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014073 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014074}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014075
Walter Dörwald16807132007-05-25 13:52:07 +000014076void
14077PyUnicode_InternInPlace(PyObject **p)
14078{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014079 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014080 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014081#ifdef Py_DEBUG
14082 assert(s != NULL);
14083 assert(_PyUnicode_CHECK(s));
14084#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014085 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014086 return;
14087#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014088 /* If it's a subclass, we don't really know what putting
14089 it in the interned dict might do. */
14090 if (!PyUnicode_CheckExact(s))
14091 return;
14092 if (PyUnicode_CHECK_INTERNED(s))
14093 return;
14094 if (interned == NULL) {
14095 interned = PyDict_New();
14096 if (interned == NULL) {
14097 PyErr_Clear(); /* Don't leave an exception */
14098 return;
14099 }
14100 }
14101 /* It might be that the GetItem call fails even
14102 though the key is present in the dictionary,
14103 namely when this happens during a stack overflow. */
14104 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014105 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014107
Benjamin Peterson29060642009-01-31 22:14:21 +000014108 if (t) {
14109 Py_INCREF(t);
14110 Py_DECREF(*p);
14111 *p = t;
14112 return;
14113 }
Walter Dörwald16807132007-05-25 13:52:07 +000014114
Benjamin Peterson14339b62009-01-31 16:36:08 +000014115 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014116 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 PyErr_Clear();
14118 PyThreadState_GET()->recursion_critical = 0;
14119 return;
14120 }
14121 PyThreadState_GET()->recursion_critical = 0;
14122 /* The two references in interned are not counted by refcnt.
14123 The deallocator will take care of this */
14124 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014125 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014126}
14127
14128void
14129PyUnicode_InternImmortal(PyObject **p)
14130{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014131 PyUnicode_InternInPlace(p);
14132 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014133 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014134 Py_INCREF(*p);
14135 }
Walter Dörwald16807132007-05-25 13:52:07 +000014136}
14137
14138PyObject *
14139PyUnicode_InternFromString(const char *cp)
14140{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 PyObject *s = PyUnicode_FromString(cp);
14142 if (s == NULL)
14143 return NULL;
14144 PyUnicode_InternInPlace(&s);
14145 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014146}
14147
Alexander Belopolsky40018472011-02-26 01:02:56 +000014148void
14149_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014151 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014152 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014153 Py_ssize_t i, n;
14154 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014155
Benjamin Peterson14339b62009-01-31 16:36:08 +000014156 if (interned == NULL || !PyDict_Check(interned))
14157 return;
14158 keys = PyDict_Keys(interned);
14159 if (keys == NULL || !PyList_Check(keys)) {
14160 PyErr_Clear();
14161 return;
14162 }
Walter Dörwald16807132007-05-25 13:52:07 +000014163
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14165 detector, interned unicode strings are not forcibly deallocated;
14166 rather, we give them their stolen references back, and then clear
14167 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014168
Benjamin Peterson14339b62009-01-31 16:36:08 +000014169 n = PyList_GET_SIZE(keys);
14170 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014171 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014173 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014174 if (PyUnicode_READY(s) == -1) {
14175 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014176 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014178 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 case SSTATE_NOT_INTERNED:
14180 /* XXX Shouldn't happen */
14181 break;
14182 case SSTATE_INTERNED_IMMORTAL:
14183 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014184 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 break;
14186 case SSTATE_INTERNED_MORTAL:
14187 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014188 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014189 break;
14190 default:
14191 Py_FatalError("Inconsistent interned string state.");
14192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014193 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 }
14195 fprintf(stderr, "total size of all interned strings: "
14196 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14197 "mortal/immortal\n", mortal_size, immortal_size);
14198 Py_DECREF(keys);
14199 PyDict_Clear(interned);
14200 Py_DECREF(interned);
14201 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014202}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014203
14204
14205/********************* Unicode Iterator **************************/
14206
14207typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014208 PyObject_HEAD
14209 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014210 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014211} unicodeiterobject;
14212
14213static void
14214unicodeiter_dealloc(unicodeiterobject *it)
14215{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014216 _PyObject_GC_UNTRACK(it);
14217 Py_XDECREF(it->it_seq);
14218 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014219}
14220
14221static int
14222unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14223{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014224 Py_VISIT(it->it_seq);
14225 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014226}
14227
14228static PyObject *
14229unicodeiter_next(unicodeiterobject *it)
14230{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014231 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014232
Benjamin Peterson14339b62009-01-31 16:36:08 +000014233 assert(it != NULL);
14234 seq = it->it_seq;
14235 if (seq == NULL)
14236 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014237 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014239 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14240 int kind = PyUnicode_KIND(seq);
14241 void *data = PyUnicode_DATA(seq);
14242 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14243 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014244 if (item != NULL)
14245 ++it->it_index;
14246 return item;
14247 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014248
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 Py_DECREF(seq);
14250 it->it_seq = NULL;
14251 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014252}
14253
14254static PyObject *
14255unicodeiter_len(unicodeiterobject *it)
14256{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014257 Py_ssize_t len = 0;
14258 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014259 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014261}
14262
14263PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14264
14265static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014266 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014267 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014269};
14270
14271PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014272 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14273 "str_iterator", /* tp_name */
14274 sizeof(unicodeiterobject), /* tp_basicsize */
14275 0, /* tp_itemsize */
14276 /* methods */
14277 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14278 0, /* tp_print */
14279 0, /* tp_getattr */
14280 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014281 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014282 0, /* tp_repr */
14283 0, /* tp_as_number */
14284 0, /* tp_as_sequence */
14285 0, /* tp_as_mapping */
14286 0, /* tp_hash */
14287 0, /* tp_call */
14288 0, /* tp_str */
14289 PyObject_GenericGetAttr, /* tp_getattro */
14290 0, /* tp_setattro */
14291 0, /* tp_as_buffer */
14292 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14293 0, /* tp_doc */
14294 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14295 0, /* tp_clear */
14296 0, /* tp_richcompare */
14297 0, /* tp_weaklistoffset */
14298 PyObject_SelfIter, /* tp_iter */
14299 (iternextfunc)unicodeiter_next, /* tp_iternext */
14300 unicodeiter_methods, /* tp_methods */
14301 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014302};
14303
14304static PyObject *
14305unicode_iter(PyObject *seq)
14306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014308
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 if (!PyUnicode_Check(seq)) {
14310 PyErr_BadInternalCall();
14311 return NULL;
14312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014313 if (PyUnicode_READY(seq) == -1)
14314 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014315 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14316 if (it == NULL)
14317 return NULL;
14318 it->it_index = 0;
14319 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014320 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 _PyObject_GC_TRACK(it);
14322 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014323}
14324
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014325
14326size_t
14327Py_UNICODE_strlen(const Py_UNICODE *u)
14328{
14329 int res = 0;
14330 while(*u++)
14331 res++;
14332 return res;
14333}
14334
14335Py_UNICODE*
14336Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14337{
14338 Py_UNICODE *u = s1;
14339 while ((*u++ = *s2++));
14340 return s1;
14341}
14342
14343Py_UNICODE*
14344Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14345{
14346 Py_UNICODE *u = s1;
14347 while ((*u++ = *s2++))
14348 if (n-- == 0)
14349 break;
14350 return s1;
14351}
14352
14353Py_UNICODE*
14354Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14355{
14356 Py_UNICODE *u1 = s1;
14357 u1 += Py_UNICODE_strlen(u1);
14358 Py_UNICODE_strcpy(u1, s2);
14359 return s1;
14360}
14361
14362int
14363Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14364{
14365 while (*s1 && *s2 && *s1 == *s2)
14366 s1++, s2++;
14367 if (*s1 && *s2)
14368 return (*s1 < *s2) ? -1 : +1;
14369 if (*s1)
14370 return 1;
14371 if (*s2)
14372 return -1;
14373 return 0;
14374}
14375
14376int
14377Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14378{
14379 register Py_UNICODE u1, u2;
14380 for (; n != 0; n--) {
14381 u1 = *s1;
14382 u2 = *s2;
14383 if (u1 != u2)
14384 return (u1 < u2) ? -1 : +1;
14385 if (u1 == '\0')
14386 return 0;
14387 s1++;
14388 s2++;
14389 }
14390 return 0;
14391}
14392
14393Py_UNICODE*
14394Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14395{
14396 const Py_UNICODE *p;
14397 for (p = s; *p; p++)
14398 if (*p == c)
14399 return (Py_UNICODE*)p;
14400 return NULL;
14401}
14402
14403Py_UNICODE*
14404Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14405{
14406 const Py_UNICODE *p;
14407 p = s + Py_UNICODE_strlen(s);
14408 while (p != s) {
14409 p--;
14410 if (*p == c)
14411 return (Py_UNICODE*)p;
14412 }
14413 return NULL;
14414}
Victor Stinner331ea922010-08-10 16:37:20 +000014415
Victor Stinner71133ff2010-09-01 23:43:53 +000014416Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014417PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014418{
Victor Stinner577db2c2011-10-11 22:12:48 +020014419 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014420 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014422 if (!PyUnicode_Check(unicode)) {
14423 PyErr_BadArgument();
14424 return NULL;
14425 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014426 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014427 if (u == NULL)
14428 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014429 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014430 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014431 PyErr_NoMemory();
14432 return NULL;
14433 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014434 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014435 size *= sizeof(Py_UNICODE);
14436 copy = PyMem_Malloc(size);
14437 if (copy == NULL) {
14438 PyErr_NoMemory();
14439 return NULL;
14440 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014441 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014442 return copy;
14443}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014444
Georg Brandl66c221e2010-10-14 07:04:07 +000014445/* A _string module, to export formatter_parser and formatter_field_name_split
14446 to the string.Formatter class implemented in Python. */
14447
14448static PyMethodDef _string_methods[] = {
14449 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14450 METH_O, PyDoc_STR("split the argument as a field name")},
14451 {"formatter_parser", (PyCFunction) formatter_parser,
14452 METH_O, PyDoc_STR("parse the argument as a format string")},
14453 {NULL, NULL}
14454};
14455
14456static struct PyModuleDef _string_module = {
14457 PyModuleDef_HEAD_INIT,
14458 "_string",
14459 PyDoc_STR("string helper module"),
14460 0,
14461 _string_methods,
14462 NULL,
14463 NULL,
14464 NULL,
14465 NULL
14466};
14467
14468PyMODINIT_FUNC
14469PyInit__string(void)
14470{
14471 return PyModule_Create(&_string_module);
14472}
14473
14474
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014475#ifdef __cplusplus
14476}
14477#endif