blob: 67336bf9f19b7201749c866f041f11ac7ad253cd [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors for the unicode object structs.

   Most underscore-prefixed macros below simply cast op and name a struct
   field, so they can be used both to read and to assign the field.
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH additionally assert
   _PyUnicode_CHECK(op).  PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH assert
   that op is a ready unicode object and special-case compact ASCII
   strings. */

/* utf8 field of the PyCompactUnicodeObject part (unchecked) */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 bytes of op: for a compact ASCII string, the memory that starts
   right after the PyASCIIObject header; otherwise the utf8 field */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length field of the PyCompactUnicodeObject part (unchecked) */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of op: for a compact ASCII string, the length field;
   otherwise the utf8_length field */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length (unchecked) */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header (unchecked) */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, after asserting _PyUnicode_CHECK(op) */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (unchecked) */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY.  After asserting
   _PyUnicode_CHECK(op), it evaluates to 0 when op is already ready and
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer of op is the very same pointer as its character
   data, i.e. no separate UTF-8 buffer exists.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the very same pointer as its character
   data.  The macro only refers to its own argument; it used to expand to
   _PyUnicode_WSTR(unicode) and therefore silently depended on the caller
   having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

128
/* True if op owns a UTF-8 buffer of its own: op is not a compact ASCII
   string, its utf8 pointer is set, and that pointer is not the character
   data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if op owns a wstr buffer of its own: its wstr pointer is set and
   either op is not ready yet or wstr is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

144
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to; it is
   converted to "to_type *" as a whole, so an expression such as "buf + 2"
   may be passed safely.

   Each argument is evaluated exactly once.  Characters are copied four at a
   time while at least four remain, then one by one.  All helper locals use
   a leading underscore to stay clear of the caller's names. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: one entry per
   code point 0x00..0x7F, non-zero when the code point is whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: one entry per code point 0x00..0x7F, non-zero when
   the code point is a line break.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Invariant checker for unicode objects; only compiled when Py_DEBUG is
   defined.

   Every check is an assert().  The function itself always returns 1, so it
   can be used as assert(_PyUnicode_CheckConsistency(op, ...)).

   op            -- object to inspect; must pass PyUnicode_Check().
   check_content -- if non-zero and the kind is not PyUnicode_WCHAR_KIND,
                    the characters are scanned as well and the largest one
                    must fall in the range that matches the kind. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not compact, not ready: only the wstr buffer exists */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* not compact, ready: characters live behind data.any */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else
                assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length recorded */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
Victor Stinner3a50e702011-10-18 21:21:00 +0200504#ifdef HAVE_MBCS
505static OSVERSIONINFOEX winver;
506#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526#define BLOOM_MASK unsigned long
527
528static BLOOM_MASK bloom_linebreak;
529
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low bits
   of ch (ch modulo BLOOM_WIDTH).  mask is parenthesized so that an
   expression such as "a | b" can be passed as the mask.  BLOOM yields a
   non-zero value when the bit is set. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: code points below 128 are looked up in ascii_linebreak;
   for the others the bloom_linebreak mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when its bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552#define BLOOM_MEMBER(mask, chr, str) \
553 (BLOOM(mask, chr) \
554 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
/* Debug-only counter: number of objects created by _PyUnicode_New().
   File-scope statics start at zero. */
static int unicode_old_new_calls;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
/* Debug-only counter: number of objects created by PyUnicode_New().
   File-scope statics start at zero. */
static int unicode_new_new_calls;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
Benjamin Petersonbac79492012-01-14 13:34:47 -05001266 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001268 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
/* Debug-only counter: number of calls to _PyUnicode_Ready().
   File-scope statics start at zero. */
static int unicode_ready_calls;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a non-wchar string of length 1 can be in the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001794 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001895 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001962 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
Benjamin Petersonbac79492012-01-14 13:34:47 -05001988 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001997 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` characters.

   A size of -1 means that the length is determined with wcslen().
   With w == NULL the empty string is returned for size 0; any other size
   results in PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    /* No buffer given */
    if (size == 0) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
Victor Stinner96865452011-03-01 23:44:09 +00002164/* helper for PyUnicode_FromFormatV() */
2165
2166static const char*
2167parse_format_flags(const char *f,
2168 int *p_width, int *p_precision,
2169 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2170{
2171 int width, precision, longflag, longlongflag, size_tflag;
2172
2173 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2174 f++;
2175 width = 0;
2176 while (Py_ISDIGIT((unsigned)*f))
2177 width = (width*10) + *f++ - '0';
2178 precision = 0;
2179 if (*f == '.') {
2180 f++;
2181 while (Py_ISDIGIT((unsigned)*f))
2182 precision = (precision*10) + *f++ - '0';
2183 if (*f == '%') {
2184 /* "%.3%s" => f points to "3" */
2185 f--;
2186 }
2187 }
2188 if (*f == '\0') {
2189 /* bogus format "%.1" => go backward, f points to "1" */
2190 f--;
2191 }
2192 if (p_width != NULL)
2193 *p_width = width;
2194 if (p_precision != NULL)
2195 *p_precision = precision;
2196
2197 /* Handle %ld, %lu, %lld and %llu. */
2198 longflag = 0;
2199 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002200 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002201
2202 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002203 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 longflag = 1;
2205 ++f;
2206 }
2207#ifdef HAVE_LONG_LONG
2208 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002209 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002210 longlongflag = 1;
2211 f += 2;
2212 }
2213#endif
2214 }
2215 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002216 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002217 size_tflag = 1;
2218 ++f;
2219 }
2220 if (p_longflag != NULL)
2221 *p_longflag = longflag;
2222 if (p_longlongflag != NULL)
2223 *p_longlongflag = longlongflag;
2224 if (p_size_tflag != NULL)
2225 *p_size_tflag = size_tflag;
2226 return f;
2227}
2228
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229/* maximum number of characters required for output of %ld. 21 characters
2230 allows for 64-bit integers (in decimal) and an optional sign. */
2231#define MAX_LONG_CHARS 21
2232/* maximum number of characters required for output of %lld.
2233 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2234 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2235#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002474 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002491 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002493 if (PyUnicode_READY(str) == -1) {
2494 Py_DECREF(str);
2495 goto fail;
2496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002498 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 /* Remember the str and switch to the next slot */
2501 *callresult++ = str;
2502 break;
2503 }
2504 case 'R':
2505 {
2506 PyObject *obj = va_arg(count, PyObject *);
2507 PyObject *repr;
2508 assert(obj);
2509 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002510 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002512 if (PyUnicode_READY(repr) == -1) {
2513 Py_DECREF(repr);
2514 goto fail;
2515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002517 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 /* Remember the repr and switch to the next slot */
2520 *callresult++ = repr;
2521 break;
2522 }
2523 case 'A':
2524 {
2525 PyObject *obj = va_arg(count, PyObject *);
2526 PyObject *ascii;
2527 assert(obj);
2528 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002529 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002531 if (PyUnicode_READY(ascii) == -1) {
2532 Py_DECREF(ascii);
2533 goto fail;
2534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002536 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 /* Remember the repr and switch to the next slot */
2539 *callresult++ = ascii;
2540 break;
2541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 default:
2543 /* if we stumble upon an unknown
2544 formatting code, copy the rest of
2545 the format string to the output
2546 string. (we cannot just skip the
2547 code, since there's no way to know
2548 what's in the argument list) */
2549 n += strlen(p);
2550 goto expand;
2551 }
2552 } else
2553 n++;
2554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 we don't have to resize the string.
2559 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 if (!string)
2562 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 kind = PyUnicode_KIND(string);
2564 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002570 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002571
2572 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2574 /* checking for == because the last argument could be a empty
2575 string, which causes i to point to end, the assert at the end of
2576 the loop */
2577 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002578
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 switch (*f) {
2580 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002581 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const int ordinal = va_arg(vargs, int);
2583 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002586 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 case 'p':
2591 /* unused, since we already have the result */
2592 if (*f == 'p')
2593 (void) va_arg(vargs, void *);
2594 else
2595 (void) va_arg(vargs, int);
2596 /* extract the result from numberresults and append. */
2597 for (; *numberresult; ++i, ++numberresult)
2598 PyUnicode_WRITE(kind, data, i, *numberresult);
2599 /* skip over the separating '\0' */
2600 assert(*numberresult == '\0');
2601 numberresult++;
2602 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 case 's':
2605 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 size = PyUnicode_GET_LENGTH(*callresult);
2610 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002611 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002613 /* We're done with the unicode()/repr() => forget it */
2614 Py_DECREF(*callresult);
2615 /* switch to next unicode()/repr() result */
2616 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 break;
2618 }
2619 case 'U':
2620 {
2621 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 Py_ssize_t size;
2623 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2624 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 break;
2628 }
2629 case 'V':
2630 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 size = PyUnicode_GET_LENGTH(obj);
2636 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002637 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 size = PyUnicode_GET_LENGTH(*callresult);
2641 assert(PyUnicode_KIND(*callresult) <=
2642 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002643 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002645 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 break;
2649 }
2650 case 'S':
2651 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002652 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002654 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* unused, since we already have the result */
2656 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 copy_characters(string, i, *callresult, 0, size);
2659 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* We're done with the unicode()/repr() => forget it */
2661 Py_DECREF(*callresult);
2662 /* switch to next unicode()/repr() result */
2663 ++callresult;
2664 break;
2665 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 break;
2669 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 for (; *p; ++p, ++i)
2671 PyUnicode_WRITE(kind, data, i, *p);
2672 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 goto end;
2674 }
Victor Stinner1205f272010-09-11 00:54:47 +00002675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 else {
2677 assert(i < PyUnicode_GET_LENGTH(string));
2678 PyUnicode_WRITE(kind, data, i++, *f);
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002682
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 if (callresults)
2685 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002688 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 if (callresults) {
2691 PyObject **callresult2 = callresults;
2692 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002693 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 ++callresult2;
2695 }
2696 PyObject_Free(callresults);
2697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (numberresults)
2699 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701}
2702
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703PyObject *
2704PyUnicode_FromFormat(const char *format, ...)
2705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 PyObject* ret;
2707 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
2709#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 ret = PyUnicode_FromFormatV(format, vargs);
2715 va_end(vargs);
2716 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717}
2718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719#ifdef HAVE_WCHAR_H
2720
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2722 convert a Unicode object to a wide character string.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) required to convert the unicode object. Ignore size argument.
2726
Victor Stinnerd88d9832011-09-06 02:00:05 +02002727 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 wchar_t *w,
2733 Py_ssize_t size)
2734{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 const wchar_t *wstr;
2737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002738 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 if (wstr == NULL)
2740 return -1;
2741
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (size > res)
2744 size = res + 1;
2745 else
2746 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 return res;
2749 }
2750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752}
2753
2754Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002755PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 wchar_t *w,
2757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 PyErr_BadInternalCall();
2761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002763 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Victor Stinner137c34c2010-09-29 10:25:54 +00002766wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002767PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 Py_ssize_t *size)
2769{
2770 wchar_t* buffer;
2771 Py_ssize_t buflen;
2772
2773 if (unicode == NULL) {
2774 PyErr_BadInternalCall();
2775 return NULL;
2776 }
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (buflen == -1)
2780 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 PyErr_NoMemory();
2783 return NULL;
2784 }
2785
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2787 if (buffer == NULL) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (size != NULL)
2795 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002796 return buffer;
2797}
2798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
Alexander Belopolsky40018472011-02-26 01:02:56 +00002801PyObject *
2802PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002805 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_SetString(PyExc_ValueError,
2807 "chr() arg not in range(0x110000)");
2808 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (ordinal < 256)
2812 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 v = PyUnicode_New(1, ordinal);
2815 if (v == NULL)
2816 return NULL;
2817 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002818 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820}
2821
Alexander Belopolsky40018472011-02-26 01:02:56 +00002822PyObject *
2823PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002828 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002829 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 Py_INCREF(obj);
2831 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832 }
2833 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 /* For a Unicode subtype that's not a Unicode object,
2835 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002836 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002838 PyErr_Format(PyExc_TypeError,
2839 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002840 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002841 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 const char *encoding,
2847 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 PyErr_BadInternalCall();
2854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 /* Decoding bytes objects is the most common case and should be fast */
2858 if (PyBytes_Check(obj)) {
2859 if (PyBytes_GET_SIZE(obj) == 0) {
2860 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002861 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 }
2863 else {
2864 v = PyUnicode_Decode(
2865 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2866 encoding, errors);
2867 }
2868 return v;
2869 }
2870
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 PyErr_SetString(PyExc_TypeError,
2873 "decoding str is not supported");
2874 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2878 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2879 PyErr_Format(PyExc_TypeError,
2880 "coercing to str: need bytes, bytearray "
2881 "or buffer-like object, %.80s found",
2882 Py_TYPE(obj)->tp_name);
2883 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002884 }
Tim Petersced69f82003-09-16 20:30:58 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002888 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Tim Petersced69f82003-09-16 20:30:58 +00002890 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002894 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895}
2896
/* Normalize an encoding name so it can be compared against the shortcut
   names: copy `encoding` into `lower`, converting upper case characters
   (as reported by Py_ISUPPER) to lower case and replacing '_' with '-',
   in order to catch e.g. UTF_8.  A NULL `encoding` stands for "utf-8".

   `lower_len` is the full size of the `lower` buffer, terminator included.

   Return 1 on success.  Return 0 when the normalized name is longer than
   lower_len-1 characters (or when lower_len is 0); `lower` is then not
   guaranteed to be null-terminated and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for a terminator nothing can be stored.  Bailing out
       here also keeps &lower[lower_len - 1] inside the buffer. */
    if (lower_len == 0)
        return 0;

    /* Route the default name through the bounded copy below instead of an
       unchecked strcpy(), so that lower_len is honoured in every case. */
    if (encoding == NULL)
        encoding = "utf-8";

    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for the '\0' */
    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (l == l_end)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 Py_ssize_t size,
2937 const char *encoding,
2938 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002939{
2940 PyObject *buffer = NULL, *unicode;
2941 Py_buffer info;
2942 char lower[11]; /* Enough for any encoding shortcut */
2943
Fred Drakee4315f52000-05-09 19:53:39 +00002944 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002945 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002946 if ((strcmp(lower, "utf-8") == 0) ||
2947 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002948 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002949 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002951 (strcmp(lower, "iso-8859-1") == 0))
2952 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002953#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002954 else if (strcmp(lower, "mbcs") == 0)
2955 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002956#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if (strcmp(lower, "ascii") == 0)
2958 return PyUnicode_DecodeASCII(s, size, errors);
2959 else if (strcmp(lower, "utf-16") == 0)
2960 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2961 else if (strcmp(lower, "utf-32") == 0)
2962 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964
2965 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002967 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002969 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 if (buffer == NULL)
2971 goto onError;
2972 unicode = PyCodec_Decode(buffer, encoding, errors);
2973 if (unicode == NULL)
2974 goto onError;
2975 if (!PyUnicode_Check(unicode)) {
2976 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002977 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002978 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(unicode);
2980 goto onError;
2981 }
2982 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002983 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 Py_XDECREF(buffer);
2987 return NULL;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
3005 /* Decode via the codec registry */
3006 v = PyCodec_Decode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003009 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
3034 if (!PyUnicode_Check(v)) {
3035 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003036 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 Py_TYPE(v)->tp_name);
3038 Py_DECREF(v);
3039 goto onError;
3040 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 Py_ssize_t size,
3050 const char *encoding,
3051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052{
3053 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 unicode = PyUnicode_FromUnicode(s, size);
3056 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3059 Py_DECREF(unicode);
3060 return v;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Encode via the codec registry */
3079 v = PyCodec_Encode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
3082 return v;
3083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert, by feeding the string to wcstombs() one character at a time.

   Returns the index, in wchar_t units from the start of `wstr`, of the
   offending character.  Returns 0 as well when no character fails, so the
   caller cannot tell "index 0" from "not found".  When wchar_t is 16 bits
   wide, a high surrogate followed by a low surrogate is converted as a
   single unit. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor++;
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3134
Victor Stinner1b579672011-12-17 05:47:23 +01003135static int
3136locale_error_handler(const char *errors, int *surrogateescape)
3137{
3138 if (errors == NULL) {
3139 *surrogateescape = 0;
3140 return 0;
3141 }
3142
3143 if (strcmp(errors, "strict") == 0) {
3144 *surrogateescape = 0;
3145 return 0;
3146 }
3147 if (strcmp(errors, "surrogateescape") == 0) {
3148 *surrogateescape = 1;
3149 return 0;
3150 }
3151 PyErr_Format(PyExc_ValueError,
3152 "only 'strict' and 'surrogateescape' error handlers "
3153 "are supported, not '%s'",
3154 errors);
3155 return -1;
3156}
3157
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003158PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003159PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160{
3161 Py_ssize_t wlen, wlen2;
3162 wchar_t *wstr;
3163 PyObject *bytes = NULL;
3164 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003165 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166 PyObject *exc;
3167 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003168 int surrogateescape;
3169
3170 if (locale_error_handler(errors, &surrogateescape) < 0)
3171 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172
3173 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3174 if (wstr == NULL)
3175 return NULL;
3176
3177 wlen2 = wcslen(wstr);
3178 if (wlen2 != wlen) {
3179 PyMem_Free(wstr);
3180 PyErr_SetString(PyExc_TypeError, "embedded null character");
3181 return NULL;
3182 }
3183
3184 if (surrogateescape) {
3185 /* locale encoding with surrogateescape */
3186 char *str;
3187
3188 str = _Py_wchar2char(wstr, &error_pos);
3189 if (str == NULL) {
3190 if (error_pos == (size_t)-1) {
3191 PyErr_NoMemory();
3192 PyMem_Free(wstr);
3193 return NULL;
3194 }
3195 else {
3196 goto encode_error;
3197 }
3198 }
3199 PyMem_Free(wstr);
3200
3201 bytes = PyBytes_FromString(str);
3202 PyMem_Free(str);
3203 }
3204 else {
3205 size_t len, len2;
3206
3207 len = wcstombs(NULL, wstr, 0);
3208 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003209 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003210 goto encode_error;
3211 }
3212
3213 bytes = PyBytes_FromStringAndSize(NULL, len);
3214 if (bytes == NULL) {
3215 PyMem_Free(wstr);
3216 return NULL;
3217 }
3218
3219 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3220 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003221 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003222 goto encode_error;
3223 }
3224 PyMem_Free(wstr);
3225 }
3226 return bytes;
3227
3228encode_error:
3229 errmsg = strerror(errno);
3230 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003231
3232 if (error_pos == (size_t)-1)
3233 error_pos = wcstombs_errorpos(wstr);
3234
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235 PyMem_Free(wstr);
3236 Py_XDECREF(bytes);
3237
Victor Stinner2f197072011-12-17 07:08:30 +01003238 if (errmsg != NULL) {
3239 size_t errlen;
3240 wstr = _Py_char2wchar(errmsg, &errlen);
3241 if (wstr != NULL) {
3242 reason = PyUnicode_FromWideChar(wstr, errlen);
3243 PyMem_Free(wstr);
3244 } else
3245 errmsg = NULL;
3246 }
3247 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003248 reason = PyUnicode_FromString(
3249 "wcstombs() encountered an unencodable "
3250 "wide character");
3251 if (reason == NULL)
3252 return NULL;
3253
3254 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3255 "locale", unicode,
3256 (Py_ssize_t)error_pos,
3257 (Py_ssize_t)(error_pos+1),
3258 reason);
3259 Py_DECREF(reason);
3260 if (exc != NULL) {
3261 PyCodec_StrictErrors(exc);
3262 Py_XDECREF(exc);
3263 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003264 return NULL;
3265}
3266
Victor Stinnerad158722010-10-27 00:25:46 +00003267PyObject *
3268PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003269{
Victor Stinner99b95382011-07-04 14:23:54 +02003270#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003271 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003272#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003274#else
Victor Stinner793b5312011-04-27 00:24:21 +02003275 PyInterpreterState *interp = PyThreadState_GET()->interp;
3276 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3277 cannot use it to encode and decode filenames before it is loaded. Load
3278 the Python codec requires to encode at least its own filename. Use the C
3279 version of the locale codec until the codec registry is initialized and
3280 the Python codec is loaded.
3281
3282 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3283 cannot only rely on it: check also interp->fscodec_initialized for
3284 subinterpreters. */
3285 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003286 return PyUnicode_AsEncodedString(unicode,
3287 Py_FileSystemDefaultEncoding,
3288 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003289 }
3290 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003291 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003292 }
Victor Stinnerad158722010-10-27 00:25:46 +00003293#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003294}
3295
Alexander Belopolsky40018472011-02-26 01:02:56 +00003296PyObject *
3297PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003302 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (!PyUnicode_Check(unicode)) {
3305 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 }
Fred Drakee4315f52000-05-09 19:53:39 +00003308
Fred Drakee4315f52000-05-09 19:53:39 +00003309 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003310 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003311 if ((strcmp(lower, "utf-8") == 0) ||
3312 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003313 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003314 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003318 }
Victor Stinner37296e82010-06-10 13:36:23 +00003319 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003320 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003321 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003323#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003324 else if (strcmp(lower, "mbcs") == 0)
3325 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003326#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003327 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Encode via the codec registry */
3332 v = PyCodec_Encode(unicode, encoding, errors);
3333 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
3335
3336 /* The normal path */
3337 if (PyBytes_Check(v))
3338 return v;
3339
3340 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003341 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003342 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344
3345 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3346 "encoder %s returned bytearray instead of bytes",
3347 encoding);
3348 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003349 Py_DECREF(v);
3350 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003352
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003353 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3354 Py_DECREF(v);
3355 return b;
3356 }
3357
3358 PyErr_Format(PyExc_TypeError,
3359 "encoder did not return a bytes object (type=%.400s)",
3360 Py_TYPE(v)->tp_name);
3361 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003362 return NULL;
3363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003369{
3370 PyObject *v;
3371
3372 if (!PyUnicode_Check(unicode)) {
3373 PyErr_BadArgument();
3374 goto onError;
3375 }
3376
3377 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003379
3380 /* Encode via the codec registry */
3381 v = PyCodec_Encode(unicode, encoding, errors);
3382 if (v == NULL)
3383 goto onError;
3384 if (!PyUnicode_Check(v)) {
3385 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003386 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387 Py_TYPE(v)->tp_name);
3388 Py_DECREF(v);
3389 goto onError;
3390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003392
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 return NULL;
3395}
3396
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert (invalid or incomplete character).

   Returns the byte offset of that sequence from the start of `str`.
   Returns 0 as well when every character converts, and always returns 0
   when HAVE_MBRTOWC is not defined, so the caller cannot tell "offset 0"
   from "not found". */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p;
    mbstate_t state;
    wchar_t wc;
    size_t consumed;

    memset(&state, 0, sizeof state);
    for (p = str; len != 0; p += consumed, len -= consumed) {
        consumed = mbrtowc(&wc, p, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return p - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3427
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003428PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003429PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003430 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431{
3432 wchar_t smallbuf[256];
3433 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3434 wchar_t *wstr;
3435 size_t wlen, wlen2;
3436 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003437 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003438 size_t error_pos;
3439 char *errmsg;
3440 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003441
3442 if (locale_error_handler(errors, &surrogateescape) < 0)
3443 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003444
3445 if (str[len] != '\0' || len != strlen(str)) {
3446 PyErr_SetString(PyExc_TypeError, "embedded null character");
3447 return NULL;
3448 }
3449
3450 if (surrogateescape)
3451 {
3452 wstr = _Py_char2wchar(str, &wlen);
3453 if (wstr == NULL) {
3454 if (wlen == (size_t)-1)
3455 PyErr_NoMemory();
3456 else
3457 PyErr_SetFromErrno(PyExc_OSError);
3458 return NULL;
3459 }
3460
3461 unicode = PyUnicode_FromWideChar(wstr, wlen);
3462 PyMem_Free(wstr);
3463 }
3464 else {
3465#ifndef HAVE_BROKEN_MBSTOWCS
3466 wlen = mbstowcs(NULL, str, 0);
3467#else
3468 wlen = len;
3469#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003470 if (wlen == (size_t)-1)
3471 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472 if (wlen+1 <= smallbuf_len) {
3473 wstr = smallbuf;
3474 }
3475 else {
3476 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3477 return PyErr_NoMemory();
3478
3479 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3480 if (!wstr)
3481 return PyErr_NoMemory();
3482 }
3483
3484 /* This shouldn't fail now */
3485 wlen2 = mbstowcs(wstr, str, wlen+1);
3486 if (wlen2 == (size_t)-1) {
3487 if (wstr != smallbuf)
3488 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003489 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003490 }
3491#ifdef HAVE_BROKEN_MBSTOWCS
3492 assert(wlen2 == wlen);
3493#endif
3494 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3495 if (wstr != smallbuf)
3496 PyMem_Free(wstr);
3497 }
3498 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003499
3500decode_error:
3501 errmsg = strerror(errno);
3502 assert(errmsg != NULL);
3503
3504 error_pos = mbstowcs_errorpos(str, len);
3505 if (errmsg != NULL) {
3506 size_t errlen;
3507 wstr = _Py_char2wchar(errmsg, &errlen);
3508 if (wstr != NULL) {
3509 reason = PyUnicode_FromWideChar(wstr, errlen);
3510 PyMem_Free(wstr);
3511 } else
3512 errmsg = NULL;
3513 }
3514 if (errmsg == NULL)
3515 reason = PyUnicode_FromString(
3516 "mbstowcs() encountered an invalid multibyte sequence");
3517 if (reason == NULL)
3518 return NULL;
3519
3520 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3521 "locale", str, len,
3522 (Py_ssize_t)error_pos,
3523 (Py_ssize_t)(error_pos+1),
3524 reason);
3525 Py_DECREF(reason);
3526 if (exc != NULL) {
3527 PyCodec_StrictErrors(exc);
3528 Py_XDECREF(exc);
3529 }
3530 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531}
3532
3533PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003534PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003535{
3536 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003537 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538}
3539
3540
3541PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003542PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003543 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003544 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3545}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003546
Christian Heimes5894ba72007-11-04 11:43:14 +00003547PyObject*
3548PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3549{
Victor Stinner99b95382011-07-04 14:23:54 +02003550#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003551 return PyUnicode_DecodeMBCS(s, size, NULL);
3552#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003553 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003554#else
Victor Stinner793b5312011-04-27 00:24:21 +02003555 PyInterpreterState *interp = PyThreadState_GET()->interp;
3556 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3557 cannot use it to encode and decode filenames before it is loaded. Load
3558 the Python codec requires to encode at least its own filename. Use the C
3559 version of the locale codec until the codec registry is initialized and
3560 the Python codec is loaded.
3561
3562 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3563 cannot only rely on it: check also interp->fscodec_initialized for
3564 subinterpreters. */
3565 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003566 return PyUnicode_Decode(s, size,
3567 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003568 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003569 }
3570 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003571 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572 }
Victor Stinnerad158722010-10-27 00:25:46 +00003573#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574}
3575
Martin v. Löwis011e8422009-05-05 04:43:17 +00003576
3577int
Antoine Pitrou13348842012-01-29 18:36:34 +01003578_PyUnicode_HasNULChars(PyObject* s)
3579{
3580 static PyObject *nul = NULL;
3581
3582 if (nul == NULL)
3583 nul = PyUnicode_FromStringAndSize("\0", 1);
3584 if (nul == NULL)
3585 return -1;
3586 return PyUnicode_Contains(s, nul);
3587}
3588
3589
3590int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003591PyUnicode_FSConverter(PyObject* arg, void* addr)
3592{
3593 PyObject *output = NULL;
3594 Py_ssize_t size;
3595 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003596 if (arg == NULL) {
3597 Py_DECREF(*(PyObject**)addr);
3598 return 1;
3599 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003600 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003601 output = arg;
3602 Py_INCREF(output);
3603 }
3604 else {
3605 arg = PyUnicode_FromObject(arg);
3606 if (!arg)
3607 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003608 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609 Py_DECREF(arg);
3610 if (!output)
3611 return 0;
3612 if (!PyBytes_Check(output)) {
3613 Py_DECREF(output);
3614 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3615 return 0;
3616 }
3617 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003618 size = PyBytes_GET_SIZE(output);
3619 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003621 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 Py_DECREF(output);
3623 return 0;
3624 }
3625 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003626 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003627}
3628
3629
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003630int
3631PyUnicode_FSDecoder(PyObject* arg, void* addr)
3632{
3633 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003634 if (arg == NULL) {
3635 Py_DECREF(*(PyObject**)addr);
3636 return 1;
3637 }
3638 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003639 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003640 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003641 output = arg;
3642 Py_INCREF(output);
3643 }
3644 else {
3645 arg = PyBytes_FromObject(arg);
3646 if (!arg)
3647 return 0;
3648 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3649 PyBytes_GET_SIZE(arg));
3650 Py_DECREF(arg);
3651 if (!output)
3652 return 0;
3653 if (!PyUnicode_Check(output)) {
3654 Py_DECREF(output);
3655 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3656 return 0;
3657 }
3658 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003659 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003660 Py_DECREF(output);
3661 return 0;
3662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003664 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003665 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3666 Py_DECREF(output);
3667 return 0;
3668 }
3669 *(PyObject**)addr = output;
3670 return Py_CLEANUP_SUPPORTED;
3671}
3672
3673
Martin v. Löwis5b222132007-06-10 09:51:05 +00003674char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003676{
Christian Heimesf3863112007-11-22 07:46:41 +00003677 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003679 if (!PyUnicode_Check(unicode)) {
3680 PyErr_BadArgument();
3681 return NULL;
3682 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003683 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003684 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003685
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003686 if (PyUnicode_UTF8(unicode) == NULL) {
3687 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003688 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3689 if (bytes == NULL)
3690 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003691 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3692 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693 Py_DECREF(bytes);
3694 return NULL;
3695 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003696 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3697 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3698 PyBytes_AS_STRING(bytes),
3699 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003700 Py_DECREF(bytes);
3701 }
3702
3703 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003704 *psize = PyUnicode_UTF8_LENGTH(unicode);
3705 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003706}
3707
3708char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3712}
3713
3714#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003715static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716#endif
3717
3718
3719Py_UNICODE *
3720PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 const unsigned char *one_byte;
3723#if SIZEOF_WCHAR_T == 4
3724 const Py_UCS2 *two_bytes;
3725#else
3726 const Py_UCS4 *four_bytes;
3727 const Py_UCS4 *ucs4_end;
3728 Py_ssize_t num_surrogates;
3729#endif
3730 wchar_t *w;
3731 wchar_t *wchar_end;
3732
3733 if (!PyUnicode_Check(unicode)) {
3734 PyErr_BadArgument();
3735 return NULL;
3736 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003737 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003739 assert(_PyUnicode_KIND(unicode) != 0);
3740 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741
3742#ifdef Py_DEBUG
3743 ++unicode_as_unicode_calls;
3744#endif
3745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003746 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3749 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750 num_surrogates = 0;
3751
3752 for (; four_bytes < ucs4_end; ++four_bytes) {
3753 if (*four_bytes > 0xFFFF)
3754 ++num_surrogates;
3755 }
3756
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3758 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3759 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 PyErr_NoMemory();
3761 return NULL;
3762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 w = _PyUnicode_WSTR(unicode);
3766 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3767 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3769 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003770 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003772 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3773 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 }
3775 else
3776 *w = *four_bytes;
3777
3778 if (w > wchar_end) {
3779 assert(0 && "Miscalculated string end");
3780 }
3781 }
3782 *w = 0;
3783#else
3784 /* sizeof(wchar_t) == 4 */
3785 Py_FatalError("Impossible unicode object state, wstr and str "
3786 "should share memory already.");
3787 return NULL;
3788#endif
3789 }
3790 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3792 (_PyUnicode_LENGTH(unicode) + 1));
3793 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794 PyErr_NoMemory();
3795 return NULL;
3796 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003797 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3798 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3799 w = _PyUnicode_WSTR(unicode);
3800 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003802 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3803 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003804 for (; w < wchar_end; ++one_byte, ++w)
3805 *w = *one_byte;
3806 /* null-terminate the wstr */
3807 *w = 0;
3808 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003809 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 for (; w < wchar_end; ++two_bytes, ++w)
3813 *w = *two_bytes;
3814 /* null-terminate the wstr */
3815 *w = 0;
3816#else
3817 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 PyObject_FREE(_PyUnicode_WSTR(unicode));
3819 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 Py_FatalError("Impossible unicode object state, wstr "
3821 "and str should share memory already.");
3822 return NULL;
3823#endif
3824 }
3825 else {
3826 assert(0 && "This should never happen.");
3827 }
3828 }
3829 }
3830 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 *size = PyUnicode_WSTR_LENGTH(unicode);
3832 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003833}
3834
Alexander Belopolsky40018472011-02-26 01:02:56 +00003835Py_UNICODE *
3836PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839}
3840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841
Alexander Belopolsky40018472011-02-26 01:02:56 +00003842Py_ssize_t
3843PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844{
3845 if (!PyUnicode_Check(unicode)) {
3846 PyErr_BadArgument();
3847 goto onError;
3848 }
3849 return PyUnicode_GET_SIZE(unicode);
3850
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 return -1;
3853}
3854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855Py_ssize_t
3856PyUnicode_GetLength(PyObject *unicode)
3857{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003858 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 PyErr_BadArgument();
3860 return -1;
3861 }
3862
3863 return PyUnicode_GET_LENGTH(unicode);
3864}
3865
3866Py_UCS4
3867PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3868{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003869 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3870 PyErr_BadArgument();
3871 return (Py_UCS4)-1;
3872 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003873 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003874 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 return (Py_UCS4)-1;
3876 }
3877 return PyUnicode_READ_CHAR(unicode, index);
3878}
3879
3880int
3881PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3882{
3883 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003884 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 return -1;
3886 }
Victor Stinner488fa492011-12-12 00:01:39 +01003887 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003888 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003889 PyErr_SetString(PyExc_IndexError, "string index out of range");
3890 return -1;
3891 }
Victor Stinner488fa492011-12-12 00:01:39 +01003892 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003893 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3895 index, ch);
3896 return 0;
3897}
3898
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3904
Victor Stinner554f3f02010-06-16 23:33:54 +00003905/* create or adjust a UnicodeDecodeError */
3906static void
3907make_decode_exception(PyObject **exceptionObject,
3908 const char *encoding,
3909 const char *input, Py_ssize_t length,
3910 Py_ssize_t startpos, Py_ssize_t endpos,
3911 const char *reason)
3912{
3913 if (*exceptionObject == NULL) {
3914 *exceptionObject = PyUnicodeDecodeError_Create(
3915 encoding, input, length, startpos, endpos, reason);
3916 }
3917 else {
3918 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3919 goto onError;
3920 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3921 goto onError;
3922 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3923 goto onError;
3924 }
3925 return;
3926
3927onError:
3928 Py_DECREF(*exceptionObject);
3929 *exceptionObject = NULL;
3930}
3931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932/* error handling callback helper:
3933 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003934 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 and adjust various state variables.
3936 return 0 on success, -1 on error
3937*/
3938
Alexander Belopolsky40018472011-02-26 01:02:56 +00003939static int
3940unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003941 const char *encoding, const char *reason,
3942 const char **input, const char **inend, Py_ssize_t *startinpos,
3943 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003944 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003946 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947
3948 PyObject *restuple = NULL;
3949 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003950 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003951 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 Py_ssize_t requiredsize;
3953 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003954 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 int res = -1;
3956
Victor Stinner596a6c42011-11-09 00:02:18 +01003957 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3958 outsize = PyUnicode_GET_LENGTH(*output);
3959 else
3960 outsize = _PyUnicode_WSTR_LENGTH(*output);
3961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 *errorHandler = PyCodec_LookupError(errors);
3964 if (*errorHandler == NULL)
3965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 }
3967
Victor Stinner554f3f02010-06-16 23:33:54 +00003968 make_decode_exception(exceptionObject,
3969 encoding,
3970 *input, *inend - *input,
3971 *startinpos, *endinpos,
3972 reason);
3973 if (*exceptionObject == NULL)
3974 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975
3976 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3977 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003980 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 }
3983 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003985 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003986 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003987
3988 /* Copy back the bytes variables, which might have been modified by the
3989 callback */
3990 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3991 if (!inputobj)
3992 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003993 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003995 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003996 *input = PyBytes_AS_STRING(inputobj);
3997 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003998 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003999 /* we can DECREF safely, as the exception has another reference,
4000 so the object won't go away. */
4001 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004005 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4007 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009
Victor Stinner596a6c42011-11-09 00:02:18 +01004010 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4011 /* need more space? (at least enough for what we
4012 have+the replacement+the rest of the string (starting
4013 at the new input position), so we won't have to check space
4014 when there are no errors in the rest of the string) */
4015 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4016 requiredsize = *outpos + replen + insize-newpos;
4017 if (requiredsize > outsize) {
4018 if (requiredsize<2*outsize)
4019 requiredsize = 2*outsize;
4020 if (unicode_resize(output, requiredsize) < 0)
4021 goto onError;
4022 }
4023 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004024 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004025 copy_characters(*output, *outpos, repunicode, 0, replen);
4026 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004028 else {
4029 wchar_t *repwstr;
4030 Py_ssize_t repwlen;
4031 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4032 if (repwstr == NULL)
4033 goto onError;
4034 /* need more space? (at least enough for what we
4035 have+the replacement+the rest of the string (starting
4036 at the new input position), so we won't have to check space
4037 when there are no errors in the rest of the string) */
4038 requiredsize = *outpos + repwlen + insize-newpos;
4039 if (requiredsize > outsize) {
4040 if (requiredsize < 2*outsize)
4041 requiredsize = 2*outsize;
4042 if (unicode_resize(output, requiredsize) < 0)
4043 goto onError;
4044 }
4045 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4046 *outpos += repwlen;
4047 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004049 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004050
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 /* we made it! */
4052 res = 0;
4053
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 Py_XDECREF(restuple);
4056 return res;
4057}
4058
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059/* --- UTF-7 Codec -------------------------------------------------------- */
4060
Antoine Pitrou244651a2009-05-04 18:56:13 +00004061/* See RFC2152 for details. We encode conservatively and decode liberally. */
4062
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  All of them evaluate their argument more than once, so the
   argument must not have side effects. */

/* IS_BASE64: true if c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of base-64 character c.
   Only meaningful when IS_BASE64(c) holds: anything that is not a letter,
   a digit or '+' is mapped to 63. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, met outside a shift sequence in a UTF-7
 * string, should be decoded as itself.  Decoding is permissive: every
 * ASCII byte decodes to itself except '+', which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4093
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  utf7_category[c] is one of:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
/*  nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/*  dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/*  sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /   */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?   */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _   */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Set D characters always are; Set O characters only when directO is
 * true and whitespace only when directWS is true, because RFC2152 leaves
 * those two choices to the application.  NUL and anything outside ASCII
 * is never encoded directly; the range test comes first so that the table
 * is only indexed with values 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004139
Alexander Belopolsky40018472011-02-26 01:02:56 +00004140PyObject *
4141PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004142 Py_ssize_t size,
4143 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004144{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004145 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4146}
4147
Antoine Pitrou244651a2009-05-04 18:56:13 +00004148/* The decoder. The only state we preserve is our read position,
4149 * i.e. how many characters we have consumed. So if we end in the
4150 * middle of a shift sequence we have to back off the read position
4151 * and the output to the beginning of the sequence, otherwise we lose
4152 * all the shift state (seen bits, number of bits seen, high
4153 * surrogate). */
4154
Alexander Belopolsky40018472011-02-26 01:02:56 +00004155PyObject *
4156PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004157 Py_ssize_t size,
4158 const char *errors,
4159 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t startinpos;
4163 Py_ssize_t endinpos;
4164 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004166 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004167 const char *errmsg = "";
4168 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004169 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004170 unsigned int base64bits = 0;
4171 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004172 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 PyObject *errorHandler = NULL;
4174 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004175
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004176 /* Start off assuming it's all ASCII. Widen later as necessary. */
4177 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004178 if (!unicode)
4179 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004180 if (size == 0) {
4181 if (consumed)
4182 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004183 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004184 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004185
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004186 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004187 e = s + size;
4188
4189 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004190 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004192 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004193
Antoine Pitrou244651a2009-05-04 18:56:13 +00004194 if (inShift) { /* in a base-64 section */
4195 if (IS_BASE64(ch)) { /* consume a base-64 character */
4196 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4197 base64bits += 6;
4198 s++;
4199 if (base64bits >= 16) {
4200 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004201 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004202 base64bits -= 16;
4203 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4204 if (surrogate) {
4205 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004206 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4207 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004208 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4209 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004210 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004211 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004212 }
4213 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004214 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4215 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004216 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004217 }
4218 }
Victor Stinner551ac952011-11-29 22:58:13 +01004219 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004220 /* first surrogate */
4221 surrogate = outCh;
4222 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004223 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004224 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4225 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004226 }
4227 }
4228 }
4229 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004230 inShift = 0;
4231 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004232 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004233 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4234 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004235 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004236 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004237 if (base64bits > 0) { /* left-over bits */
4238 if (base64bits >= 6) {
4239 /* We've seen at least one base-64 character */
4240 errmsg = "partial character in shift sequence";
4241 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004242 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004243 else {
4244 /* Some bits remain; they should be zero */
4245 if (base64buffer != 0) {
4246 errmsg = "non-zero padding bits in shift sequence";
4247 goto utf7Error;
4248 }
4249 }
4250 }
4251 if (ch != '-') {
4252 /* '-' is absorbed; other terminating
4253 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004254 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4255 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004256 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257 }
4258 }
4259 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004261 s++; /* consume '+' */
4262 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004264 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4265 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004266 }
4267 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004269 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004270 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 }
4272 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004273 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004274 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4275 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004276 s++;
4277 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004278 else {
4279 startinpos = s-starts;
4280 s++;
4281 errmsg = "unexpected special character";
4282 goto utf7Error;
4283 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004285utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 endinpos = s-starts;
4287 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 errors, &errorHandler,
4289 "utf7", errmsg,
4290 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004291 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293 }
4294
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 /* end of string */
4296
4297 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4298 /* if we're in an inconsistent state, that's an error */
4299 if (surrogate ||
4300 (base64bits >= 6) ||
4301 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302 endinpos = size;
4303 if (unicode_decode_call_errorhandler(
4304 errors, &errorHandler,
4305 "utf7", "unterminated shift sequence",
4306 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004307 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004308 goto onError;
4309 if (s < e)
4310 goto restart;
4311 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313
4314 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004315 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
4320 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004321 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004323 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004325 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326 goto onError;
4327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_XDECREF(errorHandler);
4329 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004330 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004331
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 Py_XDECREF(errorHandler);
4334 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 Py_DECREF(unicode);
4336 return NULL;
4337}
4338
4339
Alexander Belopolsky40018472011-02-26 01:02:56 +00004340PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004341_PyUnicode_EncodeUTF7(PyObject *str,
4342 int base64SetO,
4343 int base64WhiteSpace,
4344 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004346 int kind;
4347 void *data;
4348 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004349 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004350 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004352 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 unsigned int base64bits = 0;
4354 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355 char * out;
4356 char * start;
4357
Benjamin Petersonbac79492012-01-14 13:34:47 -05004358 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004359 return NULL;
4360 kind = PyUnicode_KIND(str);
4361 data = PyUnicode_DATA(str);
4362 len = PyUnicode_GET_LENGTH(str);
4363
4364 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004367 /* It might be possible to tighten this worst case */
4368 allocated = 8 * len;
4369 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004370 return PyErr_NoMemory();
4371
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 if (v == NULL)
4374 return NULL;
4375
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004376 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004377 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004378 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (inShift) {
4381 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4382 /* shifting out */
4383 if (base64bits) { /* output remaining bits */
4384 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4385 base64buffer = 0;
4386 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
4388 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 /* Characters not in the BASE64 set implicitly unshift the sequence
4390 so no '-' is required, except if the character is itself a '-' */
4391 if (IS_BASE64(ch) || ch == '-') {
4392 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 *out++ = (char) ch;
4395 }
4396 else {
4397 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else { /* not in a shift sequence */
4401 if (ch == '+') {
4402 *out++ = '+';
4403 *out++ = '-';
4404 }
4405 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4406 *out++ = (char) ch;
4407 }
4408 else {
4409 *out++ = '+';
4410 inShift = 1;
4411 goto encode_char;
4412 }
4413 }
4414 continue;
4415encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004417 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004418
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 /* code first surrogate */
4420 base64bits += 16;
4421 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4422 while (base64bits >= 6) {
4423 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4424 base64bits -= 6;
4425 }
4426 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004427 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 base64bits += 16;
4430 base64buffer = (base64buffer << 16) | ch;
4431 while (base64bits >= 6) {
4432 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4433 base64bits -= 6;
4434 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 if (base64bits)
4437 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4438 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004440 if (_PyBytes_Resize(&v, out - start) < 0)
4441 return NULL;
4442 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004444PyObject *
4445PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4446 Py_ssize_t size,
4447 int base64SetO,
4448 int base64WhiteSpace,
4449 const char *errors)
4450{
4451 PyObject *result;
4452 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4453 if (tmp == NULL)
4454 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004455 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004456 base64WhiteSpace, errors);
4457 Py_DECREF(tmp);
4458 return result;
4459}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461#undef IS_BASE64
4462#undef FROM_BASE64
4463#undef TO_BASE64
4464#undef DECODE_DIRECT
4465#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467/* --- UTF-8 Codec -------------------------------------------------------- */
4468
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4490
Alexander Belopolsky40018472011-02-26 01:02:56 +00004491PyObject *
4492PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004493 Py_ssize_t size,
4494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495{
Walter Dörwald69652032004-09-07 20:24:22 +00004496 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4497}
4498
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004499#include "stringlib/ucs1lib.h"
4500#include "stringlib/codecs.h"
4501#include "stringlib/undef.h"
4502
4503#include "stringlib/ucs2lib.h"
4504#include "stringlib/codecs.h"
4505#include "stringlib/undef.h"
4506
4507#include "stringlib/ucs4lib.h"
4508#include "stringlib/codecs.h"
4509#include "stringlib/undef.h"
4510
Antoine Pitrouab868312009-01-10 15:40:25 +00004511/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4512#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4513
4514/* Mask to quickly check whether a C 'long' contains a
4515 non-ASCII, UTF8-encoded char. */
4516#if (SIZEOF_LONG == 8)
4517# define ASCII_CHAR_MASK 0x8080808080808080L
4518#elif (SIZEOF_LONG == 4)
4519# define ASCII_CHAR_MASK 0x80808080L
4520#else
4521# error C 'long' size should be either 4 or 8!
4522#endif
4523
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004524/* Scans a UTF-8 string and returns the maximum character to be expected
4525 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004527 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004529 */
4530static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004531utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004533 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534 const unsigned char *end = p + string_size;
4535 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004537 assert(unicode_size != NULL);
4538
4539 /* By having a cascade of independent loops which fallback onto each
4540 other, we minimize the amount of work done in the average loop
4541 iteration, and we also maximize the CPU's ability to predict
4542 branches correctly (because a given condition will have always the
4543 same boolean outcome except perhaps in the last iteration of the
4544 corresponding loop).
4545 In the general case this brings us rather close to decoding
4546 performance pre-PEP 393, despite the two-pass decoding.
4547
4548 Note that the pure ASCII loop is not duplicated once a non-ASCII
4549 character has been encountered. It is actually a pessimization (by
4550 a significant factor) to use this loop on text with many non-ASCII
4551 characters, and it is important to avoid bad performance on valid
4552 utf-8 data (invalid utf-8 being a different can of worms).
4553 */
4554
4555 /* ASCII */
4556 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004557 /* Only check value if it's not a ASCII char... */
4558 if (*p < 0x80) {
4559 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4560 an explanation. */
4561 if (!((size_t) p & LONG_PTR_MASK)) {
4562 /* Help register allocation */
4563 register const unsigned char *_p = p;
4564 while (_p < aligned_end) {
4565 unsigned long value = *(unsigned long *) _p;
4566 if (value & ASCII_CHAR_MASK)
4567 break;
4568 _p += SIZEOF_LONG;
4569 char_count += SIZEOF_LONG;
4570 }
4571 p = _p;
4572 if (p == end)
4573 break;
4574 }
4575 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004576 if (*p < 0x80)
4577 ++char_count;
4578 else
4579 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004581 *unicode_size = char_count;
4582 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004584_ucs1loop:
4585 for (; p < end; ++p) {
4586 if (*p < 0xc4)
4587 char_count += ((*p & 0xc0) != 0x80);
4588 else
4589 goto _ucs2loop;
4590 }
4591 *unicode_size = char_count;
4592 return 255;
4593
4594_ucs2loop:
4595 for (; p < end; ++p) {
4596 if (*p < 0xf0)
4597 char_count += ((*p & 0xc0) != 0x80);
4598 else
4599 goto _ucs4loop;
4600 }
4601 *unicode_size = char_count;
4602 return 65535;
4603
4604_ucs4loop:
4605 for (; p < end; ++p) {
4606 char_count += ((*p & 0xc0) != 0x80);
4607 }
4608 *unicode_size = char_count;
4609 return 65537;
4610}
4611
/* Store `value` at position `index` of the string being built.
   Unlike a plain PyUnicode_WRITE, the string is first grown (to
   index + index/8) when `index` is beyond its current length, and the
   store goes through unicode_putchar().  Either step may fail, in which
   case control jumps to the enclosing function's `onError` label.
   Implicit parameters: unicode, onError.
   Growing overallocates, so the result needs to shrink at the end.
   `index` is evaluated exactly once.
*/
#define WRITE_MAYBE_FAIL(index, value)                                   \
    do {                                                                 \
        Py_ssize_t write_pos = index;                                    \
        if (write_pos > PyUnicode_GET_LENGTH(unicode)) {                 \
            if (unicode_resize(&unicode,                                 \
                               write_pos + write_pos/8) < 0)             \
                goto onError;                                            \
        }                                                                \
        if (unicode_putchar(&unicode, &write_pos, value) < 0)            \
            goto onError;                                                \
    } while (0)
4625
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004626static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004627decode_utf8_errors(const char *starts,
4628 Py_ssize_t size,
4629 const char *errors,
4630 Py_ssize_t *consumed,
4631 const char *s,
4632 PyObject *unicode,
4633 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004634{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004636 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 Py_ssize_t startinpos;
4638 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004639 const char *e = starts + size;
4640 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004641 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 PyObject *errorHandler = NULL;
4643 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004644
Antoine Pitrouab868312009-01-10 15:40:25 +00004645 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646
4647 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004648 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649
4650 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004651 /* Fast path for runs of ASCII characters. Given that common UTF-8
4652 input will consist of an overwhelming majority of ASCII
4653 characters, we try to optimize for this case by checking
4654 as many characters as a C 'long' can contain.
4655 First, check if we can do an aligned read, as most CPUs have
4656 a penalty for unaligned reads.
4657 */
4658 if (!((size_t) s & LONG_PTR_MASK)) {
4659 /* Help register allocation */
4660 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004661 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004662 while (_s < aligned_end) {
4663 /* Read a whole long at a time (either 4 or 8 bytes),
4664 and do a fast unrolled copy if it only contains ASCII
4665 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004666 unsigned long value = *(unsigned long *) _s;
4667 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004668 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004669 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4670 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4671 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4672 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004673#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004674 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4675 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4676 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4677 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004678#endif
4679 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004681 }
4682 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004683 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004684 if (s == e)
4685 break;
4686 ch = (unsigned char)*s;
4687 }
4688 }
4689
4690 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004691 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 s++;
4693 continue;
4694 }
4695
4696 n = utf8_code_length[ch];
4697
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004698 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004699 if (consumed)
4700 break;
4701 else {
4702 errmsg = "unexpected end of data";
4703 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004704 endinpos = startinpos+1;
4705 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4706 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 goto utf8Error;
4708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710
4711 switch (n) {
4712
4713 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004714 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 startinpos = s-starts;
4716 endinpos = startinpos+1;
4717 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
4719 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004720 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 startinpos = s-starts;
4722 endinpos = startinpos+1;
4723 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724
4725 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004726 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004727 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004729 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 goto utf8Error;
4731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004733 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004734 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 break;
4736
4737 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004738 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4739 will result in surrogates in range d800-dfff. Surrogates are
4740 not valid UTF-8 so they are rejected.
4741 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4742 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004743 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004744 (s[2] & 0xc0) != 0x80 ||
4745 ((unsigned char)s[0] == 0xE0 &&
4746 (unsigned char)s[1] < 0xA0) ||
4747 ((unsigned char)s[0] == 0xED &&
4748 (unsigned char)s[1] > 0x9F)) {
4749 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004751 endinpos = startinpos + 1;
4752
4753 /* if s[1] first two bits are 1 and 0, then the invalid
4754 continuation byte is s[2], so increment endinpos by 1,
4755 if not, s[1] is invalid and endinpos doesn't need to
4756 be incremented. */
4757 if ((s[1] & 0xC0) == 0x80)
4758 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 goto utf8Error;
4760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004762 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004763 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004764 break;
4765
4766 case 4:
4767 if ((s[1] & 0xc0) != 0x80 ||
4768 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004769 (s[3] & 0xc0) != 0x80 ||
4770 ((unsigned char)s[0] == 0xF0 &&
4771 (unsigned char)s[1] < 0x90) ||
4772 ((unsigned char)s[0] == 0xF4 &&
4773 (unsigned char)s[1] > 0x8F)) {
4774 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004776 endinpos = startinpos + 1;
4777 if ((s[1] & 0xC0) == 0x80) {
4778 endinpos++;
4779 if ((s[2] & 0xC0) == 0x80)
4780 endinpos++;
4781 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 goto utf8Error;
4783 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004784 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004785 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004786 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004787
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004788 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 }
4791 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004793
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 if (unicode_decode_call_errorhandler(
4796 errors, &errorHandler,
4797 "utf8", errmsg,
4798 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004799 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004801 /* Update data because unicode_decode_call_errorhandler might have
4802 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 }
Walter Dörwald69652032004-09-07 20:24:22 +00004805 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 /* Adjust length and ready string when it contained errors and
4809 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004810 if (unicode_resize(&unicode, i) < 0)
4811 goto onError;
4812 unicode_adjust_maxchar(&unicode);
4813 if (unicode == NULL)
4814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 Py_XDECREF(errorHandler);
4817 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004818 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004819 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 Py_XDECREF(errorHandler);
4823 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004824 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 return NULL;
4826}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004827#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004828
Victor Stinner785938e2011-12-11 20:09:03 +01004829PyObject *
4830PyUnicode_DecodeUTF8Stateful(const char *s,
4831 Py_ssize_t size,
4832 const char *errors,
4833 Py_ssize_t *consumed)
4834{
4835 Py_UCS4 maxchar = 0;
4836 Py_ssize_t unicode_size;
4837 int has_errors = 0;
4838 PyObject *unicode;
4839 int kind;
4840 void *data;
4841 const char *starts = s;
4842 const char *e;
4843 Py_ssize_t i;
4844
4845 if (size == 0) {
4846 if (consumed)
4847 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004848 Py_INCREF(unicode_empty);
4849 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004850 }
4851
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004852 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004853
4854 /* When the string is ASCII only, just use memcpy and return.
4855 unicode_size may be != size if there is an incomplete UTF-8
4856 sequence at the end of the ASCII block. */
4857 if (maxchar < 128 && size == unicode_size) {
4858 if (consumed)
4859 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004860 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004861 }
4862
4863 unicode = PyUnicode_New(unicode_size, maxchar);
4864 if (!unicode)
4865 return NULL;
4866 kind = PyUnicode_KIND(unicode);
4867 data = PyUnicode_DATA(unicode);
4868
4869 /* Unpack UTF-8 encoded data */
4870 i = 0;
4871 e = starts + size;
4872 switch (kind) {
4873 case PyUnicode_1BYTE_KIND:
4874 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4875 break;
4876 case PyUnicode_2BYTE_KIND:
4877 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4878 break;
4879 case PyUnicode_4BYTE_KIND:
4880 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4881 break;
4882 }
4883 if (!has_errors) {
4884 /* Ensure the unicode size calculation was correct */
4885 assert(i == unicode_size);
4886 assert(s == e);
4887 if (consumed)
4888 *consumed = size;
4889 return unicode;
4890 }
4891
4892 /* In case of errors, maxchar and size computation might be incorrect;
4893 code below refits and resizes as necessary. */
4894 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4895}
4896
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004897#ifdef __APPLE__
4898
4899/* Simplified UTF-8 decoder using surrogateescape error handler,
4900 used to decode the command line arguments on Mac OS X. */
4901
4902wchar_t*
4903_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4904{
4905 int n;
4906 const char *e;
4907 wchar_t *unicode, *p;
4908
4909 /* Note: size will always be longer than the resulting Unicode
4910 character count */
4911 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4912 PyErr_NoMemory();
4913 return NULL;
4914 }
4915 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4916 if (!unicode)
4917 return NULL;
4918
4919 /* Unpack UTF-8 encoded data */
4920 p = unicode;
4921 e = s + size;
4922 while (s < e) {
4923 Py_UCS4 ch = (unsigned char)*s;
4924
4925 if (ch < 0x80) {
4926 *p++ = (wchar_t)ch;
4927 s++;
4928 continue;
4929 }
4930
4931 n = utf8_code_length[ch];
4932 if (s + n > e) {
4933 goto surrogateescape;
4934 }
4935
4936 switch (n) {
4937 case 0:
4938 case 1:
4939 goto surrogateescape;
4940
4941 case 2:
4942 if ((s[1] & 0xc0) != 0x80)
4943 goto surrogateescape;
4944 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4945 assert ((ch > 0x007F) && (ch <= 0x07FF));
4946 *p++ = (wchar_t)ch;
4947 break;
4948
4949 case 3:
4950 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4951 will result in surrogates in range d800-dfff. Surrogates are
4952 not valid UTF-8 so they are rejected.
4953 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4954 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4955 if ((s[1] & 0xc0) != 0x80 ||
4956 (s[2] & 0xc0) != 0x80 ||
4957 ((unsigned char)s[0] == 0xE0 &&
4958 (unsigned char)s[1] < 0xA0) ||
4959 ((unsigned char)s[0] == 0xED &&
4960 (unsigned char)s[1] > 0x9F)) {
4961
4962 goto surrogateescape;
4963 }
4964 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4965 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004966 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004967 break;
4968
4969 case 4:
4970 if ((s[1] & 0xc0) != 0x80 ||
4971 (s[2] & 0xc0) != 0x80 ||
4972 (s[3] & 0xc0) != 0x80 ||
4973 ((unsigned char)s[0] == 0xF0 &&
4974 (unsigned char)s[1] < 0x90) ||
4975 ((unsigned char)s[0] == 0xF4 &&
4976 (unsigned char)s[1] > 0x8F)) {
4977 goto surrogateescape;
4978 }
4979 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4980 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004981 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004982
4983#if SIZEOF_WCHAR_T == 4
4984 *p++ = (wchar_t)ch;
4985#else
4986 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004987 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4988 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004989#endif
4990 break;
4991 }
4992 s += n;
4993 continue;
4994
4995 surrogateescape:
4996 *p++ = 0xDC00 + ch;
4997 s++;
4998 }
4999 *p = L'\0';
5000 return unicode;
5001}
5002
5003#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005005/* Primary internal function which creates utf8 encoded bytes objects.
5006
5007 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005008 and allocate exactly as much space needed at the end. Else allocate the
5009 maximum possible needed (4 result bytes per Unicode character), and return
5010 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005011*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005012PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005013_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014{
Victor Stinner6099a032011-12-18 14:22:26 +01005015 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005016 void *data;
5017 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005019 if (!PyUnicode_Check(unicode)) {
5020 PyErr_BadArgument();
5021 return NULL;
5022 }
5023
5024 if (PyUnicode_READY(unicode) == -1)
5025 return NULL;
5026
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005027 if (PyUnicode_UTF8(unicode))
5028 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5029 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030
5031 kind = PyUnicode_KIND(unicode);
5032 data = PyUnicode_DATA(unicode);
5033 size = PyUnicode_GET_LENGTH(unicode);
5034
Benjamin Petersonead6b532011-12-20 17:23:42 -06005035 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005036 default:
5037 assert(0);
5038 case PyUnicode_1BYTE_KIND:
5039 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5040 assert(!PyUnicode_IS_ASCII(unicode));
5041 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5042 case PyUnicode_2BYTE_KIND:
5043 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5044 case PyUnicode_4BYTE_KIND:
5045 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047}
5048
Alexander Belopolsky40018472011-02-26 01:02:56 +00005049PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5051 Py_ssize_t size,
5052 const char *errors)
5053{
5054 PyObject *v, *unicode;
5055
5056 unicode = PyUnicode_FromUnicode(s, size);
5057 if (unicode == NULL)
5058 return NULL;
5059 v = _PyUnicode_AsUTF8String(unicode, errors);
5060 Py_DECREF(unicode);
5061 return v;
5062}
5063
5064PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005065PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005067 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068}
5069
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070/* --- UTF-32 Codec ------------------------------------------------------- */
5071
5072PyObject *
5073PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 Py_ssize_t size,
5075 const char *errors,
5076 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077{
5078 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5079}
5080
5081PyObject *
5082PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 Py_ssize_t size,
5084 const char *errors,
5085 int *byteorder,
5086 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087{
5088 const char *starts = s;
5089 Py_ssize_t startinpos;
5090 Py_ssize_t endinpos;
5091 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005092 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005093 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 int bo = 0; /* assume native ordering by default */
5095 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 /* Offsets from q for retrieving bytes in the right order. */
5097#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5098 int iorder[] = {0, 1, 2, 3};
5099#else
5100 int iorder[] = {3, 2, 1, 0};
5101#endif
5102 PyObject *errorHandler = NULL;
5103 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005104
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 q = (unsigned char *)s;
5106 e = q + size;
5107
5108 if (byteorder)
5109 bo = *byteorder;
5110
5111 /* Check for BOM marks (U+FEFF) in the input and adjust current
5112 byte order setting accordingly. In native mode, the leading BOM
5113 mark is skipped, in all other modes, it is copied to the output
5114 stream as-is (giving a ZWNBSP character). */
5115 if (bo == 0) {
5116 if (size >= 4) {
5117 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 if (bom == 0x0000FEFF) {
5121 q += 4;
5122 bo = -1;
5123 }
5124 else if (bom == 0xFFFE0000) {
5125 q += 4;
5126 bo = 1;
5127 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 if (bom == 0x0000FEFF) {
5130 q += 4;
5131 bo = 1;
5132 }
5133 else if (bom == 0xFFFE0000) {
5134 q += 4;
5135 bo = -1;
5136 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139 }
5140
5141 if (bo == -1) {
5142 /* force LE */
5143 iorder[0] = 0;
5144 iorder[1] = 1;
5145 iorder[2] = 2;
5146 iorder[3] = 3;
5147 }
5148 else if (bo == 1) {
5149 /* force BE */
5150 iorder[0] = 3;
5151 iorder[1] = 2;
5152 iorder[2] = 1;
5153 iorder[3] = 0;
5154 }
5155
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005156 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005157 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005158 if (!unicode)
5159 return NULL;
5160 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005162 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005163
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_UCS4 ch;
5166 /* remaining bytes at the end? (size should be divisible by 4) */
5167 if (e-q<4) {
5168 if (consumed)
5169 break;
5170 errmsg = "truncated data";
5171 startinpos = ((const char *)q)-starts;
5172 endinpos = ((const char *)e)-starts;
5173 goto utf32Error;
5174 /* The remaining input chars are ignored if the callback
5175 chooses to skip the input */
5176 }
5177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5178 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 if (ch >= 0x110000)
5181 {
5182 errmsg = "codepoint not in range(0x110000)";
5183 startinpos = ((const char *)q)-starts;
5184 endinpos = startinpos+4;
5185 goto utf32Error;
5186 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005187 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5188 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 q += 4;
5190 continue;
5191 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 if (unicode_decode_call_errorhandler(
5193 errors, &errorHandler,
5194 "utf32", errmsg,
5195 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005196 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 }
5199
5200 if (byteorder)
5201 *byteorder = bo;
5202
5203 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005205
5206 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005207 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005208 goto onError;
5209
5210 Py_XDECREF(errorHandler);
5211 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005212 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005213
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215 Py_DECREF(unicode);
5216 Py_XDECREF(errorHandler);
5217 Py_XDECREF(exc);
5218 return NULL;
5219}
5220
5221PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005222_PyUnicode_EncodeUTF32(PyObject *str,
5223 const char *errors,
5224 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005226 int kind;
5227 void *data;
5228 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005231 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 /* Offsets from p for storing byte pairs in the right order. */
5233#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5234 int iorder[] = {0, 1, 2, 3};
5235#else
5236 int iorder[] = {3, 2, 1, 0};
5237#endif
5238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239#define STORECHAR(CH) \
5240 do { \
5241 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5242 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5243 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5244 p[iorder[0]] = (CH) & 0xff; \
5245 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 } while(0)
5247
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248 if (!PyUnicode_Check(str)) {
5249 PyErr_BadArgument();
5250 return NULL;
5251 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005252 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005253 return NULL;
5254 kind = PyUnicode_KIND(str);
5255 data = PyUnicode_DATA(str);
5256 len = PyUnicode_GET_LENGTH(str);
5257
5258 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005259 bytesize = nsize * 4;
5260 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005262 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005263 if (v == NULL)
5264 return NULL;
5265
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005266 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005270 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271
5272 if (byteorder == -1) {
5273 /* force LE */
5274 iorder[0] = 0;
5275 iorder[1] = 1;
5276 iorder[2] = 2;
5277 iorder[3] = 3;
5278 }
5279 else if (byteorder == 1) {
5280 /* force BE */
5281 iorder[0] = 3;
5282 iorder[1] = 2;
5283 iorder[2] = 1;
5284 iorder[3] = 0;
5285 }
5286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005287 for (i = 0; i < len; i++)
5288 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005289
5290 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005291 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292#undef STORECHAR
5293}
5294
Alexander Belopolsky40018472011-02-26 01:02:56 +00005295PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005296PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5297 Py_ssize_t size,
5298 const char *errors,
5299 int byteorder)
5300{
5301 PyObject *result;
5302 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5303 if (tmp == NULL)
5304 return NULL;
5305 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5306 Py_DECREF(tmp);
5307 return result;
5308}
5309
5310PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005311PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312{
Victor Stinnerb960b342011-11-20 19:12:52 +01005313 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314}
5315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316/* --- UTF-16 Codec ------------------------------------------------------- */
5317
Tim Peters772747b2001-08-09 22:21:55 +00005318PyObject *
5319PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 Py_ssize_t size,
5321 const char *errors,
5322 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
Walter Dörwald69652032004-09-07 20:24:22 +00005324 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5325}
5326
Antoine Pitrouab868312009-01-10 15:40:25 +00005327/* Two masks for fast checking of whether a C 'long' may contain
5328 UTF16-encoded surrogate characters. This is an efficient heuristic,
5329 assuming that non-surrogate characters with a code point >= 0x8000 are
5330 rare in most input.
5331 FAST_CHAR_MASK is used when the input is in native byte ordering,
5332 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005333*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005334#if (SIZEOF_LONG == 8)
5335# define FAST_CHAR_MASK 0x8000800080008000L
5336# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5337#elif (SIZEOF_LONG == 4)
5338# define FAST_CHAR_MASK 0x80008000L
5339# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5340#else
5341# error C 'long' size should be either 4 or 8!
5342#endif
5343
Walter Dörwald69652032004-09-07 20:24:22 +00005344PyObject *
5345PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 Py_ssize_t size,
5347 const char *errors,
5348 int *byteorder,
5349 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t startinpos;
5353 Py_ssize_t endinpos;
5354 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005355 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005356 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005357 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005359 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005360 /* Offsets from q for retrieving byte pairs in the right order. */
5361#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5362 int ihi = 1, ilo = 0;
5363#else
5364 int ihi = 0, ilo = 1;
5365#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 PyObject *errorHandler = NULL;
5367 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
5369 /* Note: size will always be longer than the resulting Unicode
5370 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005371 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 if (!unicode)
5373 return NULL;
5374 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005375 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005376 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Tim Peters772747b2001-08-09 22:21:55 +00005378 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005379 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380
5381 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005382 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005384 /* Check for BOM marks (U+FEFF) in the input and adjust current
5385 byte order setting accordingly. In native mode, the leading BOM
5386 mark is skipped, in all other modes, it is copied to the output
5387 stream as-is (giving a ZWNBSP character). */
5388 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005389 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005390 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005391#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 if (bom == 0xFEFF) {
5393 q += 2;
5394 bo = -1;
5395 }
5396 else if (bom == 0xFFFE) {
5397 q += 2;
5398 bo = 1;
5399 }
Tim Petersced69f82003-09-16 20:30:58 +00005400#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 if (bom == 0xFEFF) {
5402 q += 2;
5403 bo = 1;
5404 }
5405 else if (bom == 0xFFFE) {
5406 q += 2;
5407 bo = -1;
5408 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005409#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412
Tim Peters772747b2001-08-09 22:21:55 +00005413 if (bo == -1) {
5414 /* force LE */
5415 ihi = 1;
5416 ilo = 0;
5417 }
5418 else if (bo == 1) {
5419 /* force BE */
5420 ihi = 0;
5421 ilo = 1;
5422 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5424 native_ordering = ilo < ihi;
5425#else
5426 native_ordering = ilo > ihi;
5427#endif
Tim Peters772747b2001-08-09 22:21:55 +00005428
Antoine Pitrouab868312009-01-10 15:40:25 +00005429 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005430 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005431 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 /* First check for possible aligned read of a C 'long'. Unaligned
5433 reads are more expensive, better to defer to another iteration. */
5434 if (!((size_t) q & LONG_PTR_MASK)) {
5435 /* Fast path for runs of non-surrogate chars. */
5436 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 int kind = PyUnicode_KIND(unicode);
5438 void *data = PyUnicode_DATA(unicode);
5439 while (_q < aligned_end) {
5440 unsigned long block = * (unsigned long *) _q;
5441 unsigned short *pblock = (unsigned short*)&block;
5442 Py_UCS4 maxch;
5443 if (native_ordering) {
5444 /* Can use buffer directly */
5445 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005446 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005447 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005448 else {
5449 /* Need to byte-swap */
5450 unsigned char *_p = (unsigned char*)pblock;
5451 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005452 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453 _p[0] = _q[1];
5454 _p[1] = _q[0];
5455 _p[2] = _q[3];
5456 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005457#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005458 _p[4] = _q[5];
5459 _p[5] = _q[4];
5460 _p[6] = _q[7];
5461 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005462#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005463 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005464 maxch = Py_MAX(pblock[0], pblock[1]);
5465#if SIZEOF_LONG == 8
5466 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5467#endif
5468 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5469 if (unicode_widen(&unicode, maxch) < 0)
5470 goto onError;
5471 kind = PyUnicode_KIND(unicode);
5472 data = PyUnicode_DATA(unicode);
5473 }
5474 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5475 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5476#if SIZEOF_LONG == 8
5477 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5478 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5479#endif
5480 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005481 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005482 q = _q;
5483 if (q >= e)
5484 break;
5485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487
Benjamin Peterson14339b62009-01-31 16:36:08 +00005488 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005489
Victor Stinner551ac952011-11-29 22:58:13 +01005490 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005491 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5492 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 continue;
5494 }
5495
5496 /* UTF-16 code pair: */
5497 if (q > e) {
5498 errmsg = "unexpected end of data";
5499 startinpos = (((const char *)q) - 2) - starts;
5500 endinpos = ((const char *)e) + 1 - starts;
5501 goto utf16Error;
5502 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005503 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5504 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005506 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005507 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005508 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005509 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 continue;
5511 }
5512 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005513 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 startinpos = (((const char *)q)-4)-starts;
5515 endinpos = startinpos+2;
5516 goto utf16Error;
5517 }
5518
Benjamin Peterson14339b62009-01-31 16:36:08 +00005519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 errmsg = "illegal encoding";
5521 startinpos = (((const char *)q)-2)-starts;
5522 endinpos = startinpos+2;
5523 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005524
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005527 errors,
5528 &errorHandler,
5529 "utf16", errmsg,
5530 &starts,
5531 (const char **)&e,
5532 &startinpos,
5533 &endinpos,
5534 &exc,
5535 (const char **)&q,
5536 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005537 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005540 /* remaining byte at the end? (size should be even) */
5541 if (e == q) {
5542 if (!consumed) {
5543 errmsg = "truncated data";
5544 startinpos = ((const char *)q) - starts;
5545 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005546 if (unicode_decode_call_errorhandler(
5547 errors,
5548 &errorHandler,
5549 "utf16", errmsg,
5550 &starts,
5551 (const char **)&e,
5552 &startinpos,
5553 &endinpos,
5554 &exc,
5555 (const char **)&q,
5556 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005557 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005558 goto onError;
5559 /* The remaining input chars are ignored if the callback
5560 chooses to skip the input */
5561 }
5562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
5564 if (byteorder)
5565 *byteorder = bo;
5566
Walter Dörwald69652032004-09-07 20:24:22 +00005567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005571 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 goto onError;
5573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 Py_XDECREF(errorHandler);
5575 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005576 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 Py_XDECREF(errorHandler);
5581 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 return NULL;
5583}
5584
Antoine Pitrouab868312009-01-10 15:40:25 +00005585#undef FAST_CHAR_MASK
5586#undef SWAPPED_FAST_CHAR_MASK
5587
Tim Peters772747b2001-08-09 22:21:55 +00005588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005589_PyUnicode_EncodeUTF16(PyObject *str,
5590 const char *errors,
5591 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005593 int kind;
5594 void *data;
5595 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005596 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005597 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005598 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005599 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005600 /* Offsets from p for storing byte pairs in the right order. */
5601#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5602 int ihi = 1, ilo = 0;
5603#else
5604 int ihi = 0, ilo = 1;
5605#endif
5606
Benjamin Peterson29060642009-01-31 22:14:21 +00005607#define STORECHAR(CH) \
5608 do { \
5609 p[ihi] = ((CH) >> 8) & 0xff; \
5610 p[ilo] = (CH) & 0xff; \
5611 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005612 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005614 if (!PyUnicode_Check(str)) {
5615 PyErr_BadArgument();
5616 return NULL;
5617 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005618 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return NULL;
5620 kind = PyUnicode_KIND(str);
5621 data = PyUnicode_DATA(str);
5622 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005623
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005624 pairs = 0;
5625 if (kind == PyUnicode_4BYTE_KIND)
5626 for (i = 0; i < len; i++)
5627 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5628 pairs++;
5629 /* 2 * (len + pairs + (byteorder == 0)) */
5630 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005633 bytesize = nsize * 2;
5634 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005636 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 if (v == NULL)
5638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005640 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005644 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005645
5646 if (byteorder == -1) {
5647 /* force LE */
5648 ihi = 1;
5649 ilo = 0;
5650 }
5651 else if (byteorder == 1) {
5652 /* force BE */
5653 ihi = 0;
5654 ilo = 1;
5655 }
5656
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 for (i = 0; i < len; i++) {
5658 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5659 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005661 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5662 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 }
Tim Peters772747b2001-08-09 22:21:55 +00005664 STORECHAR(ch);
5665 if (ch2)
5666 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005667 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005668
5669 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005670 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005671#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672}
5673
Alexander Belopolsky40018472011-02-26 01:02:56 +00005674PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5676 Py_ssize_t size,
5677 const char *errors,
5678 int byteorder)
5679{
5680 PyObject *result;
5681 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5682 if (tmp == NULL)
5683 return NULL;
5684 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5685 Py_DECREF(tmp);
5686 return result;
5687}
5688
5689PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005690PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005692 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
5695/* --- Unicode Escape Codec ----------------------------------------------- */
5696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5698 if all the escapes in the string make it still a valid ASCII string.
5699 Returns -1 if any escapes were found which cause the string to
5700 pop out of ASCII range. Otherwise returns the length of the
5701 required buffer to hold the string.
5702 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005703static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5705{
5706 const unsigned char *p = (const unsigned char *)s;
5707 const unsigned char *end = p + size;
5708 Py_ssize_t length = 0;
5709
5710 if (size < 0)
5711 return -1;
5712
5713 for (; p < end; ++p) {
5714 if (*p > 127) {
5715 /* Non-ASCII */
5716 return -1;
5717 }
5718 else if (*p != '\\') {
5719 /* Normal character */
5720 ++length;
5721 }
5722 else {
5723 /* Backslash-escape, check next char */
5724 ++p;
5725 /* Escape sequence reaches till end of string or
5726 non-ASCII follow-up. */
5727 if (p >= end || *p > 127)
5728 return -1;
5729 switch (*p) {
5730 case '\n':
5731 /* backslash + \n result in zero characters */
5732 break;
5733 case '\\': case '\'': case '\"':
5734 case 'b': case 'f': case 't':
5735 case 'n': case 'r': case 'v': case 'a':
5736 ++length;
5737 break;
5738 case '0': case '1': case '2': case '3':
5739 case '4': case '5': case '6': case '7':
5740 case 'x': case 'u': case 'U': case 'N':
5741 /* these do not guarantee ASCII characters */
5742 return -1;
5743 default:
5744 /* count the backslash + the other character */
5745 length += 2;
5746 }
5747 }
5748 }
5749 return length;
5750}
5751
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily by the unicode-escape decoder
   when it needs character-name lookups -- confirm against its users. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005752static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
5755PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005756 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 Py_ssize_t startinpos;
5761 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005762 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005763 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005765 char* message;
5766 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 PyObject *errorHandler = NULL;
5768 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005769 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005772 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773
5774 /* After length_of_escaped_ascii_string() there are two alternatives,
5775 either the string is pure ASCII with named escapes like \n, etc.
5776 and we determined it's exact size (common case)
5777 or it contains \x, \u, ... escape sequences. then we create a
5778 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005779 if (len >= 0) {
5780 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 if (!v)
5782 goto onError;
5783 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 }
5785 else {
5786 /* Escaped strings will always be longer than the resulting
5787 Unicode string, so we start with size here and then reduce the
5788 length after conversion to the true value.
5789 (but if the error callback returns a long replacement string
5790 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005791 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005792 if (!v)
5793 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005794 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 }
5796
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005798 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005799 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 while (s < end) {
5803 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005804 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005807 /* The only case in which i == ascii_length is a backslash
5808 followed by a newline. */
5809 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 /* Non-escape characters are interpreted as Unicode ordinals */
5812 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005813 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 continue;
5816 }
5817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 /* \ - Escapes */
5820 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005821 c = *s++;
5822 if (s > end)
5823 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825 /* The only case in which i == ascii_length is a backslash
5826 followed by a newline. */
5827 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005829 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832#define WRITECHAR(ch) \
5833 do { \
5834 if (unicode_putchar(&v, &i, ch) < 0) \
5835 goto onError; \
5836 }while(0)
5837
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 case '\\': WRITECHAR('\\'); break;
5840 case '\'': WRITECHAR('\''); break;
5841 case '\"': WRITECHAR('\"'); break;
5842 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005843 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 case 'f': WRITECHAR('\014'); break;
5845 case 't': WRITECHAR('\t'); break;
5846 case 'n': WRITECHAR('\n'); break;
5847 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005848 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005849 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 case '0': case '1': case '2': case '3':
5855 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005856 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005857 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005858 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005859 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005860 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 break;
5864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* hex escapes */
5866 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005868 digits = 2;
5869 message = "truncated \\xXX escape";
5870 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005874 digits = 4;
5875 message = "truncated \\uXXXX escape";
5876 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005879 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005880 digits = 8;
5881 message = "truncated \\UXXXXXXXX escape";
5882 hexescape:
5883 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 if (s+digits>end) {
5885 endinpos = size;
5886 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 errors, &errorHandler,
5888 "unicodeescape", "end of string in escape sequence",
5889 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005890 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 goto onError;
5892 goto nextByte;
5893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005894 for (j = 0; j < digits; ++j) {
5895 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005896 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005897 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 errors, &errorHandler,
5900 "unicodeescape", message,
5901 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005902 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005903 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005906 }
5907 chr = (chr<<4) & ~0xF;
5908 if (c >= '0' && c <= '9')
5909 chr += c - '0';
5910 else if (c >= 'a' && c <= 'f')
5911 chr += 10 + c - 'a';
5912 else
5913 chr += 10 + c - 'A';
5914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005916 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 /* _decoding_error will have already written into the
5918 target buffer. */
5919 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005921 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005922 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005924 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 errors, &errorHandler,
5928 "unicodeescape", "illegal Unicode character",
5929 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005931 goto onError;
5932 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933 break;
5934
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005936 case 'N':
5937 message = "malformed \\N character escape";
5938 if (ucnhash_CAPI == NULL) {
5939 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005940 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5941 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005942 if (ucnhash_CAPI == NULL)
5943 goto ucnhashError;
5944 }
5945 if (*s == '{') {
5946 const char *start = s+1;
5947 /* look for the closing brace */
5948 while (*s != '}' && s < end)
5949 s++;
5950 if (s > start && s < end && *s == '}') {
5951 /* found a name. look it up in the unicode database */
5952 message = "unknown Unicode character name";
5953 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005955 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005956 goto store;
5957 }
5958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 errors, &errorHandler,
5962 "unicodeescape", message,
5963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005965 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005966 break;
5967
5968 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005969 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 message = "\\ at end of string";
5971 s--;
5972 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 errors, &errorHandler,
5975 "unicodeescape", message,
5976 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005977 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005978 goto onError;
5979 }
5980 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005981 WRITECHAR('\\');
5982 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005983 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005984 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005989#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990
Victor Stinner16e6a802011-12-12 13:24:15 +01005991 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005992 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005993 Py_XDECREF(errorHandler);
5994 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005995 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005996
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005998 PyErr_SetString(
5999 PyExc_UnicodeError,
6000 "\\N escapes not supported (can't load unicodedata module)"
6001 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006002 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 Py_XDECREF(errorHandler);
6004 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006005 return NULL;
6006
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 Py_XDECREF(errorHandler);
6010 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 return NULL;
6012}
6013
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   The result holds only the escaped text; it is not enclosed in quotes.

*/
6020
Alexander Belopolsky40018472011-02-26 01:02:56 +00006021PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006022PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006025 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 int kind;
6028 void *data;
6029 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
Thomas Wouters89f507f2006-12-13 04:49:30 +00006031 /* Initial allocation is based on the longest-possible unichr
6032 escape.
6033
6034 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6035 unichr, so in this case it's the longest unichr escape. In
6036 narrow (UTF-16) builds this is five chars per source unichr
6037 since there are two unichrs in the surrogate pair, so in narrow
6038 (UTF-16) builds it's not the longest unichr escape.
6039
6040 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6041 so in the narrow (UTF-16) build case it's the longest unichr
6042 escape.
6043 */
6044
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 if (!PyUnicode_Check(unicode)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006049 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 return NULL;
6051 len = PyUnicode_GET_LENGTH(unicode);
6052 kind = PyUnicode_KIND(unicode);
6053 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006054 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006055 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6056 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6057 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6058 }
6059
6060 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006061 return PyBytes_FromStringAndSize(NULL, 0);
6062
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006065
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006066 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006068 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 if (repr == NULL)
6071 return NULL;
6072
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006073 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006076 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006077
Walter Dörwald79e913e2007-05-12 11:08:06 +00006078 /* Escape backslashes */
6079 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 *p++ = '\\';
6081 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006082 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006084
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006085 /* Map 21-bit characters to '\U00xxxxxx' */
6086 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006087 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006088 *p++ = '\\';
6089 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006090 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6091 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6092 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6093 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6094 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6095 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6096 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6097 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006099 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006100
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006102 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 *p++ = '\\';
6104 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006105 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6106 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6107 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6108 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006110
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006111 /* Map special whitespace to '\t', \n', '\r' */
6112 else if (ch == '\t') {
6113 *p++ = '\\';
6114 *p++ = 't';
6115 }
6116 else if (ch == '\n') {
6117 *p++ = '\\';
6118 *p++ = 'n';
6119 }
6120 else if (ch == '\r') {
6121 *p++ = '\\';
6122 *p++ = 'r';
6123 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006124
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006125 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006126 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006128 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006129 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6130 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006131 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Copy everything else as-is */
6134 else
6135 *p++ = (char) ch;
6136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 assert(p - PyBytes_AS_STRING(repr) > 0);
6139 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6140 return NULL;
6141 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142}
6143
Alexander Belopolsky40018472011-02-26 01:02:56 +00006144PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6146 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 PyObject *result;
6149 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6150 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 result = PyUnicode_AsUnicodeEscapeString(tmp);
6153 Py_DECREF(tmp);
6154 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
6157/* --- Raw Unicode Escape Codec ------------------------------------------- */
6158
Alexander Belopolsky40018472011-02-26 01:02:56 +00006159PyObject *
6160PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006161 Py_ssize_t size,
6162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006165 Py_ssize_t startinpos;
6166 Py_ssize_t endinpos;
6167 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006168 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 const char *end;
6170 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 PyObject *errorHandler = NULL;
6172 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 /* Escaped strings will always be longer than the resulting
6175 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 length after conversion to the true value. (But decoding error
6177 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006178 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006182 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006183 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 end = s + size;
6185 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 unsigned char c;
6187 Py_UCS4 x;
6188 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Non-escape characters are interpreted as Unicode ordinals */
6192 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006193 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6194 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 startinpos = s-starts;
6198
6199 /* \u-escapes are only interpreted iff the number of leading
6200 backslashes if odd */
6201 bs = s;
6202 for (;s < end;) {
6203 if (*s != '\\')
6204 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006205 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6206 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 }
6208 if (((s - bs) & 1) == 0 ||
6209 s >= end ||
6210 (*s != 'u' && *s != 'U')) {
6211 continue;
6212 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006213 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 count = *s=='u' ? 4 : 8;
6215 s++;
6216
6217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 for (x = 0, i = 0; i < count; ++i, ++s) {
6219 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006220 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 endinpos = s-starts;
6222 if (unicode_decode_call_errorhandler(
6223 errors, &errorHandler,
6224 "rawunicodeescape", "truncated \\uXXXX",
6225 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 goto onError;
6228 goto nextByte;
6229 }
6230 x = (x<<4) & ~0xF;
6231 if (c >= '0' && c <= '9')
6232 x += c - '0';
6233 else if (c >= 'a' && c <= 'f')
6234 x += 10 + c - 'a';
6235 else
6236 x += 10 + c - 'A';
6237 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006238 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006239 if (unicode_putchar(&v, &outpos, x) < 0)
6240 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006241 } else {
6242 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006243 if (unicode_decode_call_errorhandler(
6244 errors, &errorHandler,
6245 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006247 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 nextByte:
6251 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006253 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006255 Py_XDECREF(errorHandler);
6256 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006257 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 Py_XDECREF(errorHandler);
6262 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 return NULL;
6264}
6265
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006270 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 char *p;
6272 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006273 Py_ssize_t expandsize, pos;
6274 int kind;
6275 void *data;
6276 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 if (!PyUnicode_Check(unicode)) {
6279 PyErr_BadArgument();
6280 return NULL;
6281 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006282 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283 return NULL;
6284 kind = PyUnicode_KIND(unicode);
6285 data = PyUnicode_DATA(unicode);
6286 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006287 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6288 bytes, and 1 byte characters 4. */
6289 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006290
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006291 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006293
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006294 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 if (repr == NULL)
6296 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006297 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006298 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006300 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006301 for (pos = 0; pos < len; pos++) {
6302 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* Map 32-bit characters to '\Uxxxxxxxx' */
6304 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006305 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006306 *p++ = '\\';
6307 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006308 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6309 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6310 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6311 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6312 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6313 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6314 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6315 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 *p++ = '\\';
6320 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006321 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6322 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6323 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6324 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* Copy everything else as-is */
6327 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 *p++ = (char) ch;
6329 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006330
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006331 assert(p > q);
6332 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006333 return NULL;
6334 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006338PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6339 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006341 PyObject *result;
6342 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6343 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006344 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006345 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6346 Py_DECREF(tmp);
6347 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348}
6349
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006350/* --- Unicode Internal Codec ------------------------------------------- */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t size,
6355 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006356{
6357 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 Py_ssize_t startinpos;
6359 Py_ssize_t endinpos;
6360 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006361 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006362 const char *end;
6363 const char *reason;
6364 PyObject *errorHandler = NULL;
6365 PyObject *exc = NULL;
6366
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006367 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006368 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006369 1))
6370 return NULL;
6371
Thomas Wouters89f507f2006-12-13 04:49:30 +00006372 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006373 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006376 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006377 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006378 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006379 end = s + size;
6380
6381 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006382 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006383 Py_UCS4 ch;
6384 /* We copy the raw representation one byte at a time because the
6385 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006386 ((char *) &uch)[0] = s[0];
6387 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006388#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006389 ((char *) &uch)[2] = s[2];
6390 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006391#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006392 ch = uch;
6393
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006394 /* We have to sanity check the raw data, otherwise doom looms for
6395 some malformed UCS-4 data. */
6396 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006397#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006398 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006399#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006400 end-s < Py_UNICODE_SIZE
6401 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006403 startinpos = s - starts;
6404 if (end-s < Py_UNICODE_SIZE) {
6405 endinpos = end-starts;
6406 reason = "truncated input";
6407 }
6408 else {
6409 endinpos = s - starts + Py_UNICODE_SIZE;
6410 reason = "illegal code point (> 0x10FFFF)";
6411 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 if (unicode_decode_call_errorhandler(
6413 errors, &errorHandler,
6414 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006415 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006416 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006417 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 continue;
6419 }
6420
6421 s += Py_UNICODE_SIZE;
6422#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006423 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006424 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006425 Py_UNICODE uch2;
6426 ((char *) &uch2)[0] = s[0];
6427 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006428 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006429 {
Victor Stinner551ac952011-11-29 22:58:13 +01006430 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006431 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006432 }
6433 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006434#endif
6435
6436 if (unicode_putchar(&v, &outpos, ch) < 0)
6437 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006438 }
6439
Victor Stinner16e6a802011-12-12 13:24:15 +01006440 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006441 goto onError;
6442 Py_XDECREF(errorHandler);
6443 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006444 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006445
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006447 Py_XDECREF(v);
6448 Py_XDECREF(errorHandler);
6449 Py_XDECREF(exc);
6450 return NULL;
6451}
6452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453/* --- Latin-1 Codec ------------------------------------------------------ */
6454
Alexander Belopolsky40018472011-02-26 01:02:56 +00006455PyObject *
6456PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006457 Py_ssize_t size,
6458 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006461 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462}
6463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006465static void
6466make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006467 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006468 PyObject *unicode,
6469 Py_ssize_t startpos, Py_ssize_t endpos,
6470 const char *reason)
6471{
6472 if (*exceptionObject == NULL) {
6473 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006475 encoding, unicode, startpos, endpos, reason);
6476 }
6477 else {
6478 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6479 goto onError;
6480 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6481 goto onError;
6482 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6483 goto onError;
6484 return;
6485 onError:
6486 Py_DECREF(*exceptionObject);
6487 *exceptionObject = NULL;
6488 }
6489}
6490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492static void
6493raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006494 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006495 PyObject *unicode,
6496 Py_ssize_t startpos, Py_ssize_t endpos,
6497 const char *reason)
6498{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006499 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006500 encoding, unicode, startpos, endpos, reason);
6501 if (*exceptionObject != NULL)
6502 PyCodec_StrictErrors(*exceptionObject);
6503}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504
6505/* error handling callback helper:
6506 build arguments, call the callback and check the arguments,
6507 put the result into newpos and return the replacement string, which
6508 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006509static PyObject *
6510unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006511 PyObject **errorHandler,
6512 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006514 Py_ssize_t startpos, Py_ssize_t endpos,
6515 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006517 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 PyObject *restuple;
6520 PyObject *resunicode;
6521
6522 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 }
6527
Benjamin Petersonbac79492012-01-14 13:34:47 -05006528 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006529 return NULL;
6530 len = PyUnicode_GET_LENGTH(unicode);
6531
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006532 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536
6537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006542 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 Py_DECREF(restuple);
6544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006546 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 &resunicode, newpos)) {
6548 Py_DECREF(restuple);
6549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006551 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6552 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6553 Py_DECREF(restuple);
6554 return NULL;
6555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 *newpos = len + *newpos;
6558 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6560 Py_DECREF(restuple);
6561 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 Py_INCREF(resunicode);
6564 Py_DECREF(restuple);
6565 return resunicode;
6566}
6567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006570 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006571 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 /* input state */
6574 Py_ssize_t pos=0, size;
6575 int kind;
6576 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 /* output object */
6578 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 /* pointer into the output */
6580 char *str;
6581 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006583 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6584 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585 PyObject *errorHandler = NULL;
6586 PyObject *exc = NULL;
6587 /* the following variable is used for caching string comparisons
6588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6589 int known_errorHandler = -1;
6590
Benjamin Petersonbac79492012-01-14 13:34:47 -05006591 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006592 return NULL;
6593 size = PyUnicode_GET_LENGTH(unicode);
6594 kind = PyUnicode_KIND(unicode);
6595 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596 /* allocate enough for a simple encoding without
6597 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006598 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006599 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006600 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006601 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006602 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006603 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604 ressize = size;
6605
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 while (pos < size) {
6607 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 /* can we encode this? */
6610 if (c<limit) {
6611 /* no overflow check, because we know that the space is enough */
6612 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 Py_ssize_t requiredsize;
6617 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 Py_ssize_t collstart = pos;
6621 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 ++collend;
6625 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6626 if (known_errorHandler==-1) {
6627 if ((errors==NULL) || (!strcmp(errors, "strict")))
6628 known_errorHandler = 1;
6629 else if (!strcmp(errors, "replace"))
6630 known_errorHandler = 2;
6631 else if (!strcmp(errors, "ignore"))
6632 known_errorHandler = 3;
6633 else if (!strcmp(errors, "xmlcharrefreplace"))
6634 known_errorHandler = 4;
6635 else
6636 known_errorHandler = 0;
6637 }
6638 switch (known_errorHandler) {
6639 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006640 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 goto onError;
6642 case 2: /* replace */
6643 while (collstart++<collend)
6644 *str++ = '?'; /* fall through */
6645 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 break;
6648 case 4: /* xmlcharrefreplace */
6649 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 /* determine replacement size */
6651 for (i = collstart, repsize = 0; i < collend; ++i) {
6652 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6653 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006665 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006666 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 if (requiredsize > ressize) {
6672 if (requiredsize<2*ressize)
6673 requiredsize = 2*ressize;
6674 if (_PyBytes_Resize(&res, requiredsize))
6675 goto onError;
6676 str = PyBytes_AS_STRING(res) + respos;
6677 ressize = requiredsize;
6678 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 /* generate replacement */
6680 for (i = collstart; i < collend; ++i) {
6681 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006683 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 break;
6685 default:
6686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006687 encoding, reason, unicode, &exc,
6688 collstart, collend, &newpos);
6689 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006690 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006692 if (PyBytes_Check(repunicode)) {
6693 /* Directly copy bytes result to output. */
6694 repsize = PyBytes_Size(repunicode);
6695 if (repsize > 1) {
6696 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006697 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006698 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6699 Py_DECREF(repunicode);
6700 goto onError;
6701 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006702 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006703 ressize += repsize-1;
6704 }
6705 memcpy(str, PyBytes_AsString(repunicode), repsize);
6706 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006707 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006708 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006709 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 /* need more space? (at least enough for what we
6712 have+the replacement+the rest of the string, so
6713 we won't have to check space for encodable characters) */
6714 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 repsize = PyUnicode_GET_LENGTH(repunicode);
6716 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 if (requiredsize > ressize) {
6718 if (requiredsize<2*ressize)
6719 requiredsize = 2*ressize;
6720 if (_PyBytes_Resize(&res, requiredsize)) {
6721 Py_DECREF(repunicode);
6722 goto onError;
6723 }
6724 str = PyBytes_AS_STRING(res) + respos;
6725 ressize = requiredsize;
6726 }
6727 /* check if there is anything unencodable in the replacement
6728 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 for (i = 0; repsize-->0; ++i, ++str) {
6730 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006732 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 Py_DECREF(repunicode);
6735 goto onError;
6736 }
6737 *str = (char)c;
6738 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 }
6743 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006744 /* Resize if we allocated to much */
6745 size = str - PyBytes_AS_STRING(res);
6746 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006747 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006748 if (_PyBytes_Resize(&res, size) < 0)
6749 goto onError;
6750 }
6751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 Py_XDECREF(errorHandler);
6753 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006754 return res;
6755
6756 onError:
6757 Py_XDECREF(res);
6758 Py_XDECREF(errorHandler);
6759 Py_XDECREF(exc);
6760 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761}
6762
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764PyObject *
6765PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006766 Py_ssize_t size,
6767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 PyObject *result;
6770 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6771 if (unicode == NULL)
6772 return NULL;
6773 result = unicode_encode_ucs1(unicode, errors, 256);
6774 Py_DECREF(unicode);
6775 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
Alexander Belopolsky40018472011-02-26 01:02:56 +00006778PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780{
6781 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 PyErr_BadArgument();
6783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785 if (PyUnicode_READY(unicode) == -1)
6786 return NULL;
6787 /* Fast path: if it is a one-byte string, construct
6788 bytes object directly. */
6789 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6790 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6791 PyUnicode_GET_LENGTH(unicode));
6792 /* Non-Latin-1 characters present. Defer to above function to
6793 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006795}
6796
6797PyObject*
6798PyUnicode_AsLatin1String(PyObject *unicode)
6799{
6800 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
6803/* --- 7-bit ASCII Codec -------------------------------------------------- */
6804
Alexander Belopolsky40018472011-02-26 01:02:56 +00006805PyObject *
6806PyUnicode_DecodeASCII(const char *s,
6807 Py_ssize_t size,
6808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006811 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006812 int kind;
6813 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006814 Py_ssize_t startinpos;
6815 Py_ssize_t endinpos;
6816 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006817 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006818 int has_error;
6819 const unsigned char *p = (const unsigned char *)s;
6820 const unsigned char *end = p + size;
6821 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 PyObject *errorHandler = NULL;
6823 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006824
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006825 if (size == 0) {
6826 Py_INCREF(unicode_empty);
6827 return unicode_empty;
6828 }
6829
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006831 if (size == 1 && (unsigned char)s[0] < 128)
6832 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006833
Victor Stinner702c7342011-10-05 13:50:52 +02006834 has_error = 0;
6835 while (p < end && !has_error) {
6836 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6837 an explanation. */
6838 if (!((size_t) p & LONG_PTR_MASK)) {
6839 /* Help register allocation */
6840 register const unsigned char *_p = p;
6841 while (_p < aligned_end) {
6842 unsigned long value = *(unsigned long *) _p;
6843 if (value & ASCII_CHAR_MASK) {
6844 has_error = 1;
6845 break;
6846 }
6847 _p += SIZEOF_LONG;
6848 }
6849 if (_p == end)
6850 break;
6851 if (has_error)
6852 break;
6853 p = _p;
6854 }
6855 if (*p & 0x80) {
6856 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006857 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006858 }
6859 else {
6860 ++p;
6861 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006862 }
Victor Stinner702c7342011-10-05 13:50:52 +02006863 if (!has_error)
6864 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006865
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006866 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006870 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006871 kind = PyUnicode_KIND(v);
6872 data = PyUnicode_DATA(v);
6873 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 e = s + size;
6875 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 register unsigned char c = (unsigned char)*s;
6877 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006878 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 ++s;
6880 }
6881 else {
6882 startinpos = s-starts;
6883 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 if (unicode_decode_call_errorhandler(
6885 errors, &errorHandler,
6886 "ascii", "ordinal not in range(128)",
6887 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006888 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006890 kind = PyUnicode_KIND(v);
6891 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006894 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 Py_XDECREF(errorHandler);
6897 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006898 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006899 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006900
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 Py_XDECREF(errorHandler);
6904 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 return NULL;
6906}
6907
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006909PyObject *
6910PyUnicode_EncodeASCII(const Py_UNICODE *p,
6911 Py_ssize_t size,
6912 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 PyObject *result;
6915 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6916 if (unicode == NULL)
6917 return NULL;
6918 result = unicode_encode_ucs1(unicode, errors, 128);
6919 Py_DECREF(unicode);
6920 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
Alexander Belopolsky40018472011-02-26 01:02:56 +00006923PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006924_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
6926 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 PyErr_BadArgument();
6928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006930 if (PyUnicode_READY(unicode) == -1)
6931 return NULL;
6932 /* Fast path: if it is an ASCII-only string, construct bytes object
6933 directly. Else defer to above function to raise the exception. */
6934 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6935 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6936 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006937 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006938}
6939
6940PyObject *
6941PyUnicode_AsASCIIString(PyObject *unicode)
6942{
6943 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944}
6945
Victor Stinner99b95382011-07-04 14:23:54 +02006946#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006947
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006948/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006949
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006950#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951#define NEED_RETRY
6952#endif
6953
Victor Stinner3a50e702011-10-18 21:21:00 +02006954#ifndef WC_ERR_INVALID_CHARS
6955# define WC_ERR_INVALID_CHARS 0x0080
6956#endif
6957
6958static char*
6959code_page_name(UINT code_page, PyObject **obj)
6960{
6961 *obj = NULL;
6962 if (code_page == CP_ACP)
6963 return "mbcs";
6964 if (code_page == CP_UTF7)
6965 return "CP_UTF7";
6966 if (code_page == CP_UTF8)
6967 return "CP_UTF8";
6968
6969 *obj = PyBytes_FromFormat("cp%u", code_page);
6970 if (*obj == NULL)
6971 return NULL;
6972 return PyBytes_AS_STRING(*obj);
6973}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
Alexander Belopolsky40018472011-02-26 01:02:56 +00006975static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006976is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977{
6978 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 if (!IsDBCSLeadByteEx(code_page, *curr))
6982 return 0;
6983
6984 prev = CharPrevExA(code_page, s, curr, 0);
6985 if (prev == curr)
6986 return 1;
6987 /* FIXME: This code is limited to "true" double-byte encodings,
6988 as it assumes an incomplete character consists of a single
6989 byte. */
6990 if (curr - prev == 2)
6991 return 1;
6992 if (!IsDBCSLeadByteEx(code_page, *prev))
6993 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 return 0;
6995}
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997static DWORD
6998decode_code_page_flags(UINT code_page)
6999{
7000 if (code_page == CP_UTF7) {
7001 /* The CP_UTF7 decoder only supports flags=0 */
7002 return 0;
7003 }
7004 else
7005 return MB_ERR_INVALID_CHARS;
7006}
7007
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 * Decode a byte string from a Windows code page into unicode object in strict
7010 * mode.
7011 *
7012 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7013 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007015static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007016decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007017 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 const char *in,
7019 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020{
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007022 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
7025 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 assert(insize > 0);
7027 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7028 if (outsize <= 0)
7029 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
7031 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007033 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007034 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 if (*v == NULL)
7036 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007037 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038 }
7039 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007041 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007042 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045 }
7046
7047 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7049 if (outsize <= 0)
7050 goto error;
7051 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053error:
7054 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7055 return -2;
7056 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007057 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058}
7059
Victor Stinner3a50e702011-10-18 21:21:00 +02007060/*
7061 * Decode a byte string from a code page into unicode object with an error
7062 * handler.
7063 *
7064 * Returns consumed size if succeed, or raise a WindowsError or
7065 * UnicodeDecodeError exception and returns -1 on error.
7066 */
7067static int
7068decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 PyObject **v,
7070 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 const char *errors)
7072{
7073 const char *startin = in;
7074 const char *endin = in + size;
7075 const DWORD flags = decode_code_page_flags(code_page);
7076 /* Ideally, we should get reason from FormatMessage. This is the Windows
7077 2000 English version of the message. */
7078 const char *reason = "No mapping for the Unicode character exists "
7079 "in the target code page.";
7080 /* each step cannot decode more than 1 character, but a character can be
7081 represented as a surrogate pair */
7082 wchar_t buffer[2], *startout, *out;
7083 int insize, outsize;
7084 PyObject *errorHandler = NULL;
7085 PyObject *exc = NULL;
7086 PyObject *encoding_obj = NULL;
7087 char *encoding;
7088 DWORD err;
7089 int ret = -1;
7090
7091 assert(size > 0);
7092
7093 encoding = code_page_name(code_page, &encoding_obj);
7094 if (encoding == NULL)
7095 return -1;
7096
7097 if (errors == NULL || strcmp(errors, "strict") == 0) {
7098 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7099 UnicodeDecodeError. */
7100 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7101 if (exc != NULL) {
7102 PyCodec_StrictErrors(exc);
7103 Py_CLEAR(exc);
7104 }
7105 goto error;
7106 }
7107
7108 if (*v == NULL) {
7109 /* Create unicode object */
7110 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7111 PyErr_NoMemory();
7112 goto error;
7113 }
Victor Stinnerab595942011-12-17 04:59:06 +01007114 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 if (*v == NULL)
7117 goto error;
7118 startout = PyUnicode_AS_UNICODE(*v);
7119 }
7120 else {
7121 /* Extend unicode object */
7122 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7123 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7124 PyErr_NoMemory();
7125 goto error;
7126 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007127 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 goto error;
7129 startout = PyUnicode_AS_UNICODE(*v) + n;
7130 }
7131
7132 /* Decode the byte string character per character */
7133 out = startout;
7134 while (in < endin)
7135 {
7136 /* Decode a character */
7137 insize = 1;
7138 do
7139 {
7140 outsize = MultiByteToWideChar(code_page, flags,
7141 in, insize,
7142 buffer, Py_ARRAY_LENGTH(buffer));
7143 if (outsize > 0)
7144 break;
7145 err = GetLastError();
7146 if (err != ERROR_NO_UNICODE_TRANSLATION
7147 && err != ERROR_INSUFFICIENT_BUFFER)
7148 {
7149 PyErr_SetFromWindowsErr(0);
7150 goto error;
7151 }
7152 insize++;
7153 }
7154 /* 4=maximum length of a UTF-8 sequence */
7155 while (insize <= 4 && (in + insize) <= endin);
7156
7157 if (outsize <= 0) {
7158 Py_ssize_t startinpos, endinpos, outpos;
7159
7160 startinpos = in - startin;
7161 endinpos = startinpos + 1;
7162 outpos = out - PyUnicode_AS_UNICODE(*v);
7163 if (unicode_decode_call_errorhandler(
7164 errors, &errorHandler,
7165 encoding, reason,
7166 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007167 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 {
7169 goto error;
7170 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007171 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 }
7173 else {
7174 in += insize;
7175 memcpy(out, buffer, outsize * sizeof(wchar_t));
7176 out += outsize;
7177 }
7178 }
7179
7180 /* write a NUL character at the end */
7181 *out = 0;
7182
7183 /* Extend unicode object */
7184 outsize = out - startout;
7185 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007186 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007189
7190error:
7191 Py_XDECREF(encoding_obj);
7192 Py_XDECREF(errorHandler);
7193 Py_XDECREF(exc);
7194 return ret;
7195}
7196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197static PyObject *
7198decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 const char *s, Py_ssize_t size,
7200 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201{
Victor Stinner76a31a62011-11-04 00:05:13 +01007202 PyObject *v = NULL;
7203 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 if (code_page < 0) {
7206 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7207 return NULL;
7208 }
7209
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212
Victor Stinner76a31a62011-11-04 00:05:13 +01007213 do
7214 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 if (size > INT_MAX) {
7217 chunk_size = INT_MAX;
7218 final = 0;
7219 done = 0;
7220 }
7221 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007223 {
7224 chunk_size = (int)size;
7225 final = (consumed == NULL);
7226 done = 1;
7227 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007228
Victor Stinner76a31a62011-11-04 00:05:13 +01007229 /* Skip trailing lead-byte unless 'final' is set */
7230 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7231 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007232
Victor Stinner76a31a62011-11-04 00:05:13 +01007233 if (chunk_size == 0 && done) {
7234 if (v != NULL)
7235 break;
7236 Py_INCREF(unicode_empty);
7237 return unicode_empty;
7238 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239
Victor Stinner76a31a62011-11-04 00:05:13 +01007240
7241 converted = decode_code_page_strict(code_page, &v,
7242 s, chunk_size);
7243 if (converted == -2)
7244 converted = decode_code_page_errors(code_page, &v,
7245 s, chunk_size,
7246 errors);
7247 assert(converted != 0);
7248
7249 if (converted < 0) {
7250 Py_XDECREF(v);
7251 return NULL;
7252 }
7253
7254 if (consumed)
7255 *consumed += converted;
7256
7257 s += converted;
7258 size -= converted;
7259 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007260
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007261 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262}
7263
Alexander Belopolsky40018472011-02-26 01:02:56 +00007264PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007265PyUnicode_DecodeCodePageStateful(int code_page,
7266 const char *s,
7267 Py_ssize_t size,
7268 const char *errors,
7269 Py_ssize_t *consumed)
7270{
7271 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7272}
7273
7274PyObject *
7275PyUnicode_DecodeMBCSStateful(const char *s,
7276 Py_ssize_t size,
7277 const char *errors,
7278 Py_ssize_t *consumed)
7279{
7280 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7281}
7282
7283PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007284PyUnicode_DecodeMBCS(const char *s,
7285 Py_ssize_t size,
7286 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7289}
7290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291static DWORD
7292encode_code_page_flags(UINT code_page, const char *errors)
7293{
7294 if (code_page == CP_UTF8) {
7295 if (winver.dwMajorVersion >= 6)
7296 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7297 and later */
7298 return WC_ERR_INVALID_CHARS;
7299 else
7300 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7301 return 0;
7302 }
7303 else if (code_page == CP_UTF7) {
7304 /* CP_UTF7 only supports flags=0 */
7305 return 0;
7306 }
7307 else {
7308 if (errors != NULL && strcmp(errors, "replace") == 0)
7309 return 0;
7310 else
7311 return WC_NO_BEST_FIT_CHARS;
7312 }
7313}
7314
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 * Encode a Unicode string to a Windows code page into a byte string in strict
7317 * mode.
7318 *
7319 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7320 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007322static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007323encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326{
Victor Stinner554f3f02010-06-16 23:33:54 +00007327 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 BOOL *pusedDefaultChar = &usedDefaultChar;
7329 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007330 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007331 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007332 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 const DWORD flags = encode_code_page_flags(code_page, NULL);
7334 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007335 /* Create a substring so that we can get the UTF-16 representation
7336 of just the slice under consideration. */
7337 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007342 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007344 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007345
Victor Stinner2fc507f2011-11-04 20:06:39 +01007346 substring = PyUnicode_Substring(unicode, offset, offset+len);
7347 if (substring == NULL)
7348 return -1;
7349 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7350 if (p == NULL) {
7351 Py_DECREF(substring);
7352 return -1;
7353 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007355 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 outsize = WideCharToMultiByte(code_page, flags,
7357 p, size,
7358 NULL, 0,
7359 NULL, pusedDefaultChar);
7360 if (outsize <= 0)
7361 goto error;
7362 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007363 if (pusedDefaultChar && *pusedDefaultChar) {
7364 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007366 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007367
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007371 if (*outbytes == NULL) {
7372 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007374 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007376 }
7377 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 const Py_ssize_t n = PyBytes_Size(*outbytes);
7380 if (outsize > PY_SSIZE_T_MAX - n) {
7381 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007385 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7386 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007388 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390 }
7391
7392 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 outsize = WideCharToMultiByte(code_page, flags,
7394 p, size,
7395 out, outsize,
7396 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007397 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 if (outsize <= 0)
7399 goto error;
7400 if (pusedDefaultChar && *pusedDefaultChar)
7401 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007402 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007403
Victor Stinner3a50e702011-10-18 21:21:00 +02007404error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7407 return -2;
7408 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007409 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007410}
7411
Victor Stinner3a50e702011-10-18 21:21:00 +02007412/*
7413 * Encode a Unicode string to a Windows code page into a byte string using a
7414 * error handler.
7415 *
7416 * Returns consumed characters if succeed, or raise a WindowsError and returns
7417 * -1 on other error.
7418 */
7419static int
7420encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007421 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007422 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007423{
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 Py_ssize_t pos = unicode_offset;
7426 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 /* Ideally, we should get reason from FormatMessage. This is the Windows
7428 2000 English version of the message. */
7429 const char *reason = "invalid character";
7430 /* 4=maximum length of a UTF-8 sequence */
7431 char buffer[4];
7432 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7433 Py_ssize_t outsize;
7434 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 PyObject *errorHandler = NULL;
7436 PyObject *exc = NULL;
7437 PyObject *encoding_obj = NULL;
7438 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 PyObject *rep;
7441 int ret = -1;
7442
7443 assert(insize > 0);
7444
7445 encoding = code_page_name(code_page, &encoding_obj);
7446 if (encoding == NULL)
7447 return -1;
7448
7449 if (errors == NULL || strcmp(errors, "strict") == 0) {
7450 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7451 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007452 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 if (exc != NULL) {
7454 PyCodec_StrictErrors(exc);
7455 Py_DECREF(exc);
7456 }
7457 Py_XDECREF(encoding_obj);
7458 return -1;
7459 }
7460
7461 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7462 pusedDefaultChar = &usedDefaultChar;
7463 else
7464 pusedDefaultChar = NULL;
7465
7466 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7467 PyErr_NoMemory();
7468 goto error;
7469 }
7470 outsize = insize * Py_ARRAY_LENGTH(buffer);
7471
7472 if (*outbytes == NULL) {
7473 /* Create string object */
7474 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7475 if (*outbytes == NULL)
7476 goto error;
7477 out = PyBytes_AS_STRING(*outbytes);
7478 }
7479 else {
7480 /* Extend string object */
7481 Py_ssize_t n = PyBytes_Size(*outbytes);
7482 if (n > PY_SSIZE_T_MAX - outsize) {
7483 PyErr_NoMemory();
7484 goto error;
7485 }
7486 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7487 goto error;
7488 out = PyBytes_AS_STRING(*outbytes) + n;
7489 }
7490
7491 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7495 wchar_t chars[2];
7496 int charsize;
7497 if (ch < 0x10000) {
7498 chars[0] = (wchar_t)ch;
7499 charsize = 1;
7500 }
7501 else {
7502 ch -= 0x10000;
7503 chars[0] = 0xd800 + (ch >> 10);
7504 chars[1] = 0xdc00 + (ch & 0x3ff);
7505 charsize = 2;
7506 }
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007509 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 buffer, Py_ARRAY_LENGTH(buffer),
7511 NULL, pusedDefaultChar);
7512 if (outsize > 0) {
7513 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7514 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007515 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 memcpy(out, buffer, outsize);
7517 out += outsize;
7518 continue;
7519 }
7520 }
7521 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7522 PyErr_SetFromWindowsErr(0);
7523 goto error;
7524 }
7525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 rep = unicode_encode_call_errorhandler(
7527 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 if (rep == NULL)
7531 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007532 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533
7534 if (PyBytes_Check(rep)) {
7535 outsize = PyBytes_GET_SIZE(rep);
7536 if (outsize != 1) {
7537 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7538 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7539 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7540 Py_DECREF(rep);
7541 goto error;
7542 }
7543 out = PyBytes_AS_STRING(*outbytes) + offset;
7544 }
7545 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7546 out += outsize;
7547 }
7548 else {
7549 Py_ssize_t i;
7550 enum PyUnicode_Kind kind;
7551 void *data;
7552
Benjamin Petersonbac79492012-01-14 13:34:47 -05007553 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 Py_DECREF(rep);
7555 goto error;
7556 }
7557
7558 outsize = PyUnicode_GET_LENGTH(rep);
7559 if (outsize != 1) {
7560 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7561 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7562 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7563 Py_DECREF(rep);
7564 goto error;
7565 }
7566 out = PyBytes_AS_STRING(*outbytes) + offset;
7567 }
7568 kind = PyUnicode_KIND(rep);
7569 data = PyUnicode_DATA(rep);
7570 for (i=0; i < outsize; i++) {
7571 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7572 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007573 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007574 encoding, unicode,
7575 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 "unable to encode error handler result to ASCII");
7577 Py_DECREF(rep);
7578 goto error;
7579 }
7580 *out = (unsigned char)ch;
7581 out++;
7582 }
7583 }
7584 Py_DECREF(rep);
7585 }
7586 /* write a NUL byte */
7587 *out = 0;
7588 outsize = out - PyBytes_AS_STRING(*outbytes);
7589 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7590 if (_PyBytes_Resize(outbytes, outsize) < 0)
7591 goto error;
7592 ret = 0;
7593
7594error:
7595 Py_XDECREF(encoding_obj);
7596 Py_XDECREF(errorHandler);
7597 Py_XDECREF(exc);
7598 return ret;
7599}
7600
Victor Stinner3a50e702011-10-18 21:21:00 +02007601static PyObject *
7602encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007603 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 const char *errors)
7605{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007608 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007609 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007610
Benjamin Petersonbac79492012-01-14 13:34:47 -05007611 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007612 return NULL;
7613 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007614
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 if (code_page < 0) {
7616 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7617 return NULL;
7618 }
7619
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007621 return PyBytes_FromStringAndSize(NULL, 0);
7622
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 offset = 0;
7624 do
7625 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007627 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 chunks. */
7629 if (len > INT_MAX/2) {
7630 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007631 done = 0;
7632 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007633 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007635 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007636 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007637 done = 1;
7638 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007639
Victor Stinner76a31a62011-11-04 00:05:13 +01007640 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007641 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 errors);
7643 if (ret == -2)
7644 ret = encode_code_page_errors(code_page, &outbytes,
7645 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007647 if (ret < 0) {
7648 Py_XDECREF(outbytes);
7649 return NULL;
7650 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007651
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007654 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 return outbytes;
7657}
7658
7659PyObject *
7660PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7661 Py_ssize_t size,
7662 const char *errors)
7663{
Victor Stinner7581cef2011-11-03 22:32:33 +01007664 PyObject *unicode, *res;
7665 unicode = PyUnicode_FromUnicode(p, size);
7666 if (unicode == NULL)
7667 return NULL;
7668 res = encode_code_page(CP_ACP, unicode, errors);
7669 Py_DECREF(unicode);
7670 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007671}
7672
7673PyObject *
7674PyUnicode_EncodeCodePage(int code_page,
7675 PyObject *unicode,
7676 const char *errors)
7677{
Victor Stinner7581cef2011-11-03 22:32:33 +01007678 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007679}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007680
Alexander Belopolsky40018472011-02-26 01:02:56 +00007681PyObject *
7682PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007683{
7684 if (!PyUnicode_Check(unicode)) {
7685 PyErr_BadArgument();
7686 return NULL;
7687 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007688 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007689}
7690
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007691#undef NEED_RETRY
7692
Victor Stinner99b95382011-07-04 14:23:54 +02007693#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007694
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695/* --- Character Mapping Codec -------------------------------------------- */
7696
Alexander Belopolsky40018472011-02-26 01:02:56 +00007697PyObject *
7698PyUnicode_DecodeCharmap(const char *s,
7699 Py_ssize_t size,
7700 PyObject *mapping,
7701 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007704 Py_ssize_t startinpos;
7705 Py_ssize_t endinpos;
7706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007708 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007709 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 PyObject *errorHandler = NULL;
7711 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007712
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 /* Default to Latin-1 */
7714 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007717 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007721 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007722 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007724 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007725 Py_ssize_t maplen;
7726 enum PyUnicode_Kind kind;
7727 void *data;
7728 Py_UCS4 x;
7729
Benjamin Petersonbac79492012-01-14 13:34:47 -05007730 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007731 return NULL;
7732
7733 maplen = PyUnicode_GET_LENGTH(mapping);
7734 data = PyUnicode_DATA(mapping);
7735 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 while (s < e) {
7737 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007740 x = PyUnicode_READ(kind, data, ch);
7741 else
7742 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007744 if (x == 0xfffe)
7745 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 startinpos = s-starts;
7748 endinpos = startinpos+1;
7749 if (unicode_decode_call_errorhandler(
7750 errors, &errorHandler,
7751 "charmap", "character maps to <undefined>",
7752 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007753 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 goto onError;
7755 }
7756 continue;
7757 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007758
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007759 if (unicode_putchar(&v, &outpos, x) < 0)
7760 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007762 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007763 }
7764 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 while (s < e) {
7766 unsigned char ch = *s;
7767 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7770 w = PyLong_FromLong((long)ch);
7771 if (w == NULL)
7772 goto onError;
7773 x = PyObject_GetItem(mapping, w);
7774 Py_DECREF(w);
7775 if (x == NULL) {
7776 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7777 /* No mapping found means: mapping is undefined. */
7778 PyErr_Clear();
7779 x = Py_None;
7780 Py_INCREF(x);
7781 } else
7782 goto onError;
7783 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 /* Apply mapping */
7786 if (PyLong_Check(x)) {
7787 long value = PyLong_AS_LONG(x);
7788 if (value < 0 || value > 65535) {
7789 PyErr_SetString(PyExc_TypeError,
7790 "character mapping must be in range(65536)");
7791 Py_DECREF(x);
7792 goto onError;
7793 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007794 if (unicode_putchar(&v, &outpos, value) < 0)
7795 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 }
7797 else if (x == Py_None) {
7798 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 startinpos = s-starts;
7800 endinpos = startinpos+1;
7801 if (unicode_decode_call_errorhandler(
7802 errors, &errorHandler,
7803 "charmap", "character maps to <undefined>",
7804 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007805 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 Py_DECREF(x);
7807 goto onError;
7808 }
7809 Py_DECREF(x);
7810 continue;
7811 }
7812 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007813 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814
Benjamin Petersonbac79492012-01-14 13:34:47 -05007815 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007816 goto onError;
7817 targetsize = PyUnicode_GET_LENGTH(x);
7818
7819 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007821 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007822 PyUnicode_READ_CHAR(x, 0)) < 0)
7823 goto onError;
7824 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 else if (targetsize > 1) {
7826 /* 1-n mapping */
7827 if (targetsize > extrachars) {
7828 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 Py_ssize_t needed = (targetsize - extrachars) + \
7830 (targetsize << 2);
7831 extrachars += needed;
7832 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007833 if (unicode_resize(&v,
7834 PyUnicode_GET_LENGTH(v) + needed) < 0)
7835 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 Py_DECREF(x);
7837 goto onError;
7838 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007840 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7841 goto onError;
7842 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7843 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 extrachars -= targetsize;
7845 }
7846 /* 1-0 mapping: skip the character */
7847 }
7848 else {
7849 /* wrong return value */
7850 PyErr_SetString(PyExc_TypeError,
7851 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007852 Py_DECREF(x);
7853 goto onError;
7854 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 Py_DECREF(x);
7856 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007859 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007860 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007861 Py_XDECREF(errorHandler);
7862 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007863 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007864
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 Py_XDECREF(errorHandler);
7867 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 Py_XDECREF(v);
7869 return NULL;
7870}
7871
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007872/* Charmap encoding: the lookup table */
7873
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 PyObject_HEAD
7876 unsigned char level1[32];
7877 int count2, count3;
7878 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879};
7880
7881static PyObject*
7882encoding_map_size(PyObject *obj, PyObject* args)
7883{
7884 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007885 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887}
7888
7889static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 PyDoc_STR("Return the size (in bytes) of this object") },
7892 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893};
7894
7895static void
7896encoding_map_dealloc(PyObject* o)
7897{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007898 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899}
7900
7901static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007902 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 "EncodingMap", /*tp_name*/
7904 sizeof(struct encoding_map), /*tp_basicsize*/
7905 0, /*tp_itemsize*/
7906 /* methods */
7907 encoding_map_dealloc, /*tp_dealloc*/
7908 0, /*tp_print*/
7909 0, /*tp_getattr*/
7910 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007911 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 0, /*tp_repr*/
7913 0, /*tp_as_number*/
7914 0, /*tp_as_sequence*/
7915 0, /*tp_as_mapping*/
7916 0, /*tp_hash*/
7917 0, /*tp_call*/
7918 0, /*tp_str*/
7919 0, /*tp_getattro*/
7920 0, /*tp_setattro*/
7921 0, /*tp_as_buffer*/
7922 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7923 0, /*tp_doc*/
7924 0, /*tp_traverse*/
7925 0, /*tp_clear*/
7926 0, /*tp_richcompare*/
7927 0, /*tp_weaklistoffset*/
7928 0, /*tp_iter*/
7929 0, /*tp_iternext*/
7930 encoding_map_methods, /*tp_methods*/
7931 0, /*tp_members*/
7932 0, /*tp_getset*/
7933 0, /*tp_base*/
7934 0, /*tp_dict*/
7935 0, /*tp_descr_get*/
7936 0, /*tp_descr_set*/
7937 0, /*tp_dictoffset*/
7938 0, /*tp_init*/
7939 0, /*tp_alloc*/
7940 0, /*tp_new*/
7941 0, /*tp_free*/
7942 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943};
7944
7945PyObject*
7946PyUnicode_BuildEncodingMap(PyObject* string)
7947{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948 PyObject *result;
7949 struct encoding_map *mresult;
7950 int i;
7951 int need_dict = 0;
7952 unsigned char level1[32];
7953 unsigned char level2[512];
7954 unsigned char *mlevel1, *mlevel2, *mlevel3;
7955 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 int kind;
7957 void *data;
7958 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007960 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961 PyErr_BadArgument();
7962 return NULL;
7963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 kind = PyUnicode_KIND(string);
7965 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007966 memset(level1, 0xFF, sizeof level1);
7967 memset(level2, 0xFF, sizeof level2);
7968
7969 /* If there isn't a one-to-one mapping of NULL to \0,
7970 or if there are non-BMP characters, we need to use
7971 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973 need_dict = 1;
7974 for (i = 1; i < 256; i++) {
7975 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 ch = PyUnicode_READ(kind, data, i);
7977 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007978 need_dict = 1;
7979 break;
7980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007982 /* unmapped character */
7983 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 l1 = ch >> 11;
7985 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007986 if (level1[l1] == 0xFF)
7987 level1[l1] = count2++;
7988 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007989 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990 }
7991
7992 if (count2 >= 0xFF || count3 >= 0xFF)
7993 need_dict = 1;
7994
7995 if (need_dict) {
7996 PyObject *result = PyDict_New();
7997 PyObject *key, *value;
7998 if (!result)
7999 return NULL;
8000 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008002 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003 if (!key || !value)
8004 goto failed1;
8005 if (PyDict_SetItem(result, key, value) == -1)
8006 goto failed1;
8007 Py_DECREF(key);
8008 Py_DECREF(value);
8009 }
8010 return result;
8011 failed1:
8012 Py_XDECREF(key);
8013 Py_XDECREF(value);
8014 Py_DECREF(result);
8015 return NULL;
8016 }
8017
8018 /* Create a three-level trie */
8019 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8020 16*count2 + 128*count3 - 1);
8021 if (!result)
8022 return PyErr_NoMemory();
8023 PyObject_Init(result, &EncodingMapType);
8024 mresult = (struct encoding_map*)result;
8025 mresult->count2 = count2;
8026 mresult->count3 = count3;
8027 mlevel1 = mresult->level1;
8028 mlevel2 = mresult->level23;
8029 mlevel3 = mresult->level23 + 16*count2;
8030 memcpy(mlevel1, level1, 32);
8031 memset(mlevel2, 0xFF, 16*count2);
8032 memset(mlevel3, 0, 128*count3);
8033 count3 = 0;
8034 for (i = 1; i < 256; i++) {
8035 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037 /* unmapped character */
8038 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008039 o1 = PyUnicode_READ(kind, data, i)>>11;
8040 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 i2 = 16*mlevel1[o1] + o2;
8042 if (mlevel2[i2] == 0xFF)
8043 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 i3 = 128*mlevel2[i2] + o3;
8046 mlevel3[i3] = i;
8047 }
8048 return result;
8049}
8050
8051static int
Victor Stinner22168992011-11-20 17:09:18 +01008052encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053{
8054 struct encoding_map *map = (struct encoding_map*)mapping;
8055 int l1 = c>>11;
8056 int l2 = (c>>7) & 0xF;
8057 int l3 = c & 0x7F;
8058 int i;
8059
Victor Stinner22168992011-11-20 17:09:18 +01008060 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 if (c == 0)
8063 return 0;
8064 /* level 1*/
8065 i = map->level1[l1];
8066 if (i == 0xFF) {
8067 return -1;
8068 }
8069 /* level 2*/
8070 i = map->level23[16*i+l2];
8071 if (i == 0xFF) {
8072 return -1;
8073 }
8074 /* level 3 */
8075 i = map->level23[16*map->count2 + 128*i + l3];
8076 if (i == 0) {
8077 return -1;
8078 }
8079 return i;
8080}
8081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082/* Lookup the character ch in the mapping. If the character
8083 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008084 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008085static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008086charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087{
Christian Heimes217cfd12007-12-02 14:31:20 +00008088 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 PyObject *x;
8090
8091 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 x = PyObject_GetItem(mapping, w);
8094 Py_DECREF(w);
8095 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8097 /* No mapping found means: mapping is undefined. */
8098 PyErr_Clear();
8099 x = Py_None;
8100 Py_INCREF(x);
8101 return x;
8102 } else
8103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008105 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008107 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 long value = PyLong_AS_LONG(x);
8109 if (value < 0 || value > 255) {
8110 PyErr_SetString(PyExc_TypeError,
8111 "character mapping must be in range(256)");
8112 Py_DECREF(x);
8113 return NULL;
8114 }
8115 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008117 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 /* wrong return value */
8121 PyErr_Format(PyExc_TypeError,
8122 "character mapping must return integer, bytes or None, not %.400s",
8123 x->ob_type->tp_name);
8124 Py_DECREF(x);
8125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 }
8127}
8128
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008130charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8133 /* exponentially overallocate to minimize reallocations */
8134 if (requiredsize < 2*outsize)
8135 requiredsize = 2*outsize;
8136 if (_PyBytes_Resize(outobj, requiredsize))
8137 return -1;
8138 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139}
8140
/* Result codes of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008145 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 space is available. Return a new reference to the object that
8147 was put in the output buffer, or Py_None, if the mapping was undefined
8148 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008149 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008150static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008151charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008152 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 PyObject *rep;
8155 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008156 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157
Christian Heimes90aa7642007-12-19 02:45:37 +00008158 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (res == -1)
8162 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 if (outsize<requiredsize)
8164 if (charmapencode_resize(outobj, outpos, requiredsize))
8165 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008166 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 outstart[(*outpos)++] = (char)res;
8168 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 }
8170
8171 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 Py_DECREF(rep);
8176 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 if (PyLong_Check(rep)) {
8179 Py_ssize_t requiredsize = *outpos+1;
8180 if (outsize<requiredsize)
8181 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8182 Py_DECREF(rep);
8183 return enc_EXCEPTION;
8184 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008185 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 else {
8189 const char *repchars = PyBytes_AS_STRING(rep);
8190 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8191 Py_ssize_t requiredsize = *outpos+repsize;
8192 if (outsize<requiredsize)
8193 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8194 Py_DECREF(rep);
8195 return enc_EXCEPTION;
8196 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008197 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 memcpy(outstart + *outpos, repchars, repsize);
8199 *outpos += repsize;
8200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 Py_DECREF(rep);
8203 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204}
8205
8206/* handle an error in PyUnicode_EncodeCharmap
8207 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208static int
8209charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008210 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008212 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008213 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214{
8215 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008217 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008218 enum PyUnicode_Kind kind;
8219 void *data;
8220 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008222 Py_ssize_t collstartpos = *inpos;
8223 Py_ssize_t collendpos = *inpos+1;
8224 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 char *encoding = "charmap";
8226 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008228 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008229 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230
Benjamin Petersonbac79492012-01-14 13:34:47 -05008231 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232 return -1;
8233 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 /* find all unencodable characters */
8235 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008237 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008239 val = encoding_map_lookup(ch, mapping);
8240 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 break;
8242 ++collendpos;
8243 continue;
8244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008246 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8247 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 if (rep==NULL)
8249 return -1;
8250 else if (rep!=Py_None) {
8251 Py_DECREF(rep);
8252 break;
8253 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 }
8257 /* cache callback name lookup
8258 * (if not done yet, i.e. it's the first error) */
8259 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 if ((errors==NULL) || (!strcmp(errors, "strict")))
8261 *known_errorHandler = 1;
8262 else if (!strcmp(errors, "replace"))
8263 *known_errorHandler = 2;
8264 else if (!strcmp(errors, "ignore"))
8265 *known_errorHandler = 3;
8266 else if (!strcmp(errors, "xmlcharrefreplace"))
8267 *known_errorHandler = 4;
8268 else
8269 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 }
8271 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008273 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 return -1;
8275 case 2: /* replace */
8276 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 x = charmapencode_output('?', mapping, res, respos);
8278 if (x==enc_EXCEPTION) {
8279 return -1;
8280 }
8281 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008282 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 return -1;
8284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008285 }
8286 /* fall through */
8287 case 3: /* ignore */
8288 *inpos = collendpos;
8289 break;
8290 case 4: /* xmlcharrefreplace */
8291 /* generate replacement (temporarily (mis)uses p) */
8292 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 char buffer[2+29+1+1];
8294 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 for (cp = buffer; *cp; ++cp) {
8297 x = charmapencode_output(*cp, mapping, res, respos);
8298 if (x==enc_EXCEPTION)
8299 return -1;
8300 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008301 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 return -1;
8303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 }
8305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 *inpos = collendpos;
8307 break;
8308 default:
8309 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008312 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008314 if (PyBytes_Check(repunicode)) {
8315 /* Directly copy bytes result to output. */
8316 Py_ssize_t outsize = PyBytes_Size(*res);
8317 Py_ssize_t requiredsize;
8318 repsize = PyBytes_Size(repunicode);
8319 requiredsize = *respos + repsize;
8320 if (requiredsize > outsize)
8321 /* Make room for all additional bytes. */
8322 if (charmapencode_resize(res, respos, requiredsize)) {
8323 Py_DECREF(repunicode);
8324 return -1;
8325 }
8326 memcpy(PyBytes_AsString(*res) + *respos,
8327 PyBytes_AsString(repunicode), repsize);
8328 *respos += repsize;
8329 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008330 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008331 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008333 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008334 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008335 Py_DECREF(repunicode);
8336 return -1;
8337 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008338 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008339 data = PyUnicode_DATA(repunicode);
8340 kind = PyUnicode_KIND(repunicode);
8341 for (index = 0; index < repsize; index++) {
8342 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8343 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008345 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return -1;
8347 }
8348 else if (x==enc_FAILED) {
8349 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008350 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return -1;
8352 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008353 }
8354 *inpos = newpos;
8355 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356 }
8357 return 0;
8358}
8359
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008361_PyUnicode_EncodeCharmap(PyObject *unicode,
8362 PyObject *mapping,
8363 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 /* output object */
8366 PyObject *res = NULL;
8367 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008368 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008369 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008371 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 PyObject *errorHandler = NULL;
8373 PyObject *exc = NULL;
8374 /* the following variable is used for caching string comparisons
8375 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8376 * 3=ignore, 4=xmlcharrefreplace */
8377 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378
Benjamin Petersonbac79492012-01-14 13:34:47 -05008379 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 return NULL;
8381 size = PyUnicode_GET_LENGTH(unicode);
8382
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 /* Default to Latin-1 */
8384 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 /* allocate enough for a simple encoding without
8388 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008389 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (res == NULL)
8391 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008392 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008396 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 if (x==enc_EXCEPTION) /* error */
8400 goto onError;
8401 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 &exc,
8404 &known_errorHandler, &errorHandler, errors,
8405 &res, &respos)) {
8406 goto onError;
8407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 else
8410 /* done with this character => adjust input position */
8411 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008415 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008416 if (_PyBytes_Resize(&res, respos) < 0)
8417 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 Py_XDECREF(exc);
8420 Py_XDECREF(errorHandler);
8421 return res;
8422
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 Py_XDECREF(res);
8425 Py_XDECREF(exc);
8426 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 return NULL;
8428}
8429
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008430/* Deprecated */
8431PyObject *
8432PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8433 Py_ssize_t size,
8434 PyObject *mapping,
8435 const char *errors)
8436{
8437 PyObject *result;
8438 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8439 if (unicode == NULL)
8440 return NULL;
8441 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8442 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008443 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008444}
8445
Alexander Belopolsky40018472011-02-26 01:02:56 +00008446PyObject *
8447PyUnicode_AsCharmapString(PyObject *unicode,
8448 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449{
8450 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 PyErr_BadArgument();
8452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008454 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455}
8456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008458static void
8459make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461 Py_ssize_t startpos, Py_ssize_t endpos,
8462 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 *exceptionObject = _PyUnicodeTranslateError_Create(
8466 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 }
8468 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8470 goto onError;
8471 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8472 goto onError;
8473 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8474 goto onError;
8475 return;
8476 onError:
8477 Py_DECREF(*exceptionObject);
8478 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 }
8480}
8481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008483static void
8484raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486 Py_ssize_t startpos, Py_ssize_t endpos,
8487 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488{
8489 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493}
8494
8495/* error handling callback helper:
8496 build arguments, call the callback and check the arguments,
8497 put the result into newpos and return the replacement string, which
8498 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008499static PyObject *
8500unicode_translate_call_errorhandler(const char *errors,
8501 PyObject **errorHandler,
8502 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008504 Py_ssize_t startpos, Py_ssize_t endpos,
8505 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008507 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008509 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 PyObject *restuple;
8511 PyObject *resunicode;
8512
8513 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
8518
8519 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523
8524 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008529 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 Py_DECREF(restuple);
8531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 }
8533 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 &resunicode, &i_newpos)) {
8535 Py_DECREF(restuple);
8536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 else
8541 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8544 Py_DECREF(restuple);
8545 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 Py_INCREF(resunicode);
8548 Py_DECREF(restuple);
8549 return resunicode;
8550}
8551
8552/* Lookup the character ch in the mapping and put the result in result,
8553 which must be decrefed by the caller.
8554 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008555static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557{
Christian Heimes217cfd12007-12-02 14:31:20 +00008558 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 PyObject *x;
8560
8561 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 x = PyObject_GetItem(mapping, w);
8564 Py_DECREF(w);
8565 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8567 /* No mapping found means: use 1:1 mapping. */
8568 PyErr_Clear();
8569 *result = NULL;
8570 return 0;
8571 } else
8572 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 }
8574 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 *result = x;
8576 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008578 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 long value = PyLong_AS_LONG(x);
8580 long max = PyUnicode_GetMax();
8581 if (value < 0 || value > max) {
8582 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008583 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 Py_DECREF(x);
8585 return -1;
8586 }
8587 *result = x;
8588 return 0;
8589 }
8590 else if (PyUnicode_Check(x)) {
8591 *result = x;
8592 return 0;
8593 }
8594 else {
8595 /* wrong return value */
8596 PyErr_SetString(PyExc_TypeError,
8597 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008598 Py_DECREF(x);
8599 return -1;
8600 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601}
8602/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 if not reallocate and adjust various state variables.
8604 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008610 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 /* exponentially overallocate to minimize reallocations */
8612 if (requiredsize < 2 * oldsize)
8613 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8615 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 }
8619 return 0;
8620}
8621/* lookup the character, put the result in the output string and adjust
8622 various state variables. Return a new reference to the object that
8623 was put in the output buffer in *result, or Py_None, if the mapping was
8624 undefined (in which case no character was written).
8625 The called must decref result.
8626 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8629 PyObject *mapping, Py_UCS4 **output,
8630 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8634 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 }
8640 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008642 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 }
8646 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 Py_ssize_t repsize;
8648 if (PyUnicode_READY(*res) == -1)
8649 return -1;
8650 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 if (repsize==1) {
8652 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 }
8655 else if (repsize!=0) {
8656 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 Py_ssize_t requiredsize = *opos +
8658 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 Py_ssize_t i;
8661 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 for(i = 0; i < repsize; i++)
8664 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 }
8667 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 return 0;
8670}
8671
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673_PyUnicode_TranslateCharmap(PyObject *input,
8674 PyObject *mapping,
8675 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 /* input object */
8678 char *idata;
8679 Py_ssize_t size, i;
8680 int kind;
8681 /* output buffer */
8682 Py_UCS4 *output = NULL;
8683 Py_ssize_t osize;
8684 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 char *reason = "character maps to <undefined>";
8688 PyObject *errorHandler = NULL;
8689 PyObject *exc = NULL;
8690 /* the following variable is used for caching string comparisons
8691 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8692 * 3=ignore, 4=xmlcharrefreplace */
8693 int known_errorHandler = -1;
8694
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 PyErr_BadArgument();
8697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 if (PyUnicode_READY(input) == -1)
8701 return NULL;
8702 idata = (char*)PyUnicode_DATA(input);
8703 kind = PyUnicode_KIND(input);
8704 size = PyUnicode_GET_LENGTH(input);
8705 i = 0;
8706
8707 if (size == 0) {
8708 Py_INCREF(input);
8709 return input;
8710 }
8711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 /* allocate enough for a simple 1:1 translation without
8713 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 osize = size;
8715 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8716 opos = 0;
8717 if (output == NULL) {
8718 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 /* try to encode it */
8724 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 if (charmaptranslate_output(input, i, mapping,
8726 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 Py_XDECREF(x);
8728 goto onError;
8729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 else { /* untranslatable character */
8734 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8735 Py_ssize_t repsize;
8736 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 Py_ssize_t collstart = i;
8740 Py_ssize_t collend = i+1;
8741 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 while (collend < size) {
8745 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 goto onError;
8747 Py_XDECREF(x);
8748 if (x!=Py_None)
8749 break;
8750 ++collend;
8751 }
8752 /* cache callback name lookup
8753 * (if not done yet, i.e. it's the first error) */
8754 if (known_errorHandler==-1) {
8755 if ((errors==NULL) || (!strcmp(errors, "strict")))
8756 known_errorHandler = 1;
8757 else if (!strcmp(errors, "replace"))
8758 known_errorHandler = 2;
8759 else if (!strcmp(errors, "ignore"))
8760 known_errorHandler = 3;
8761 else if (!strcmp(errors, "xmlcharrefreplace"))
8762 known_errorHandler = 4;
8763 else
8764 known_errorHandler = 0;
8765 }
8766 switch (known_errorHandler) {
8767 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 raise_translate_exception(&exc, input, collstart,
8769 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 case 2: /* replace */
8772 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 for (coll = collstart; coll<collend; coll++)
8774 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 /* fall through */
8776 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 break;
8779 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 /* generate replacement (temporarily (mis)uses i) */
8781 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 char buffer[2+29+1+1];
8783 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8785 if (charmaptranslate_makespace(&output, &osize,
8786 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 goto onError;
8788 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 break;
8793 default:
8794 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 reason, input, &exc,
8796 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008797 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008799 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008800 Py_DECREF(repunicode);
8801 goto onError;
8802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 repsize = PyUnicode_GET_LENGTH(repunicode);
8805 if (charmaptranslate_makespace(&output, &osize,
8806 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 Py_DECREF(repunicode);
8808 goto onError;
8809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 for (uni2 = 0; repsize-->0; ++uni2)
8811 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8812 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008815 }
8816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8818 if (!res)
8819 goto onError;
8820 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821 Py_XDECREF(exc);
8822 Py_XDECREF(errorHandler);
8823 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827 Py_XDECREF(exc);
8828 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 return NULL;
8830}
8831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832/* Deprecated. Use PyUnicode_Translate instead. */
8833PyObject *
8834PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8835 Py_ssize_t size,
8836 PyObject *mapping,
8837 const char *errors)
8838{
8839 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8840 if (!unicode)
8841 return NULL;
8842 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8843}
8844
Alexander Belopolsky40018472011-02-26 01:02:56 +00008845PyObject *
8846PyUnicode_Translate(PyObject *str,
8847 PyObject *mapping,
8848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
8850 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008851
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 str = PyUnicode_FromObject(str);
8853 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 Py_DECREF(str);
8857 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008858
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 Py_XDECREF(str);
8861 return NULL;
8862}
Tim Petersced69f82003-09-16 20:30:58 +00008863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008865fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866{
8867 /* No need to call PyUnicode_READY(self) because this function is only
8868 called as a callback from fixup() which does it already. */
8869 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8870 const int kind = PyUnicode_KIND(self);
8871 void *data = PyUnicode_DATA(self);
8872 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008873 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 Py_ssize_t i;
8875
8876 for (i = 0; i < len; ++i) {
8877 ch = PyUnicode_READ(kind, data, i);
8878 fixed = 0;
8879 if (ch > 127) {
8880 if (Py_UNICODE_ISSPACE(ch))
8881 fixed = ' ';
8882 else {
8883 const int decimal = Py_UNICODE_TODECIMAL(ch);
8884 if (decimal >= 0)
8885 fixed = '0' + decimal;
8886 }
8887 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008888 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 if (fixed > maxchar)
8890 maxchar = fixed;
8891 PyUnicode_WRITE(kind, data, i, fixed);
8892 }
8893 else if (ch > maxchar)
8894 maxchar = ch;
8895 }
8896 else if (ch > maxchar)
8897 maxchar = ch;
8898 }
8899
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008900 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901}
8902
8903PyObject *
8904_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8905{
8906 if (!PyUnicode_Check(unicode)) {
8907 PyErr_BadInternalCall();
8908 return NULL;
8909 }
8910 if (PyUnicode_READY(unicode) == -1)
8911 return NULL;
8912 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8913 /* If the string is already ASCII, just return the same string */
8914 Py_INCREF(unicode);
8915 return unicode;
8916 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008917 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918}
8919
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008920PyObject *
8921PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8922 Py_ssize_t length)
8923{
Victor Stinnerf0124502011-11-21 23:12:56 +01008924 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008925 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008926 Py_UCS4 maxchar;
8927 enum PyUnicode_Kind kind;
8928 void *data;
8929
8930 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008931 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008932 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008933 if (ch > 127) {
8934 int decimal = Py_UNICODE_TODECIMAL(ch);
8935 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008936 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008937 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008938 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008939 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008940
8941 /* Copy to a new string */
8942 decimal = PyUnicode_New(length, maxchar);
8943 if (decimal == NULL)
8944 return decimal;
8945 kind = PyUnicode_KIND(decimal);
8946 data = PyUnicode_DATA(decimal);
8947 /* Iterate over code points */
8948 for (i = 0; i < length; i++) {
8949 Py_UNICODE ch = s[i];
8950 if (ch > 127) {
8951 int decimal = Py_UNICODE_TODECIMAL(ch);
8952 if (decimal >= 0)
8953 ch = '0' + decimal;
8954 }
8955 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008957 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008958}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008959/* --- Decimal Encoder ---------------------------------------------------- */
8960
Alexander Belopolsky40018472011-02-26 01:02:56 +00008961int
8962PyUnicode_EncodeDecimal(Py_UNICODE *s,
8963 Py_ssize_t length,
8964 char *output,
8965 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008966{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008967 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008968 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008969 enum PyUnicode_Kind kind;
8970 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008971
8972 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyErr_BadArgument();
8974 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008975 }
8976
Victor Stinner42bf7752011-11-21 22:52:58 +01008977 unicode = PyUnicode_FromUnicode(s, length);
8978 if (unicode == NULL)
8979 return -1;
8980
Benjamin Petersonbac79492012-01-14 13:34:47 -05008981 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008982 Py_DECREF(unicode);
8983 return -1;
8984 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008985 kind = PyUnicode_KIND(unicode);
8986 data = PyUnicode_DATA(unicode);
8987
Victor Stinnerb84d7232011-11-22 01:50:07 +01008988 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008989 PyObject *exc;
8990 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008992 Py_ssize_t startpos;
8993
8994 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008995
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008997 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008998 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009000 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 decimal = Py_UNICODE_TODECIMAL(ch);
9002 if (decimal >= 0) {
9003 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009004 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 continue;
9006 }
9007 if (0 < ch && ch < 256) {
9008 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009009 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 continue;
9011 }
Victor Stinner6345be92011-11-25 20:09:01 +01009012
Victor Stinner42bf7752011-11-21 22:52:58 +01009013 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009014 exc = NULL;
9015 raise_encode_exception(&exc, "decimal", unicode,
9016 startpos, startpos+1,
9017 "invalid decimal Unicode string");
9018 Py_XDECREF(exc);
9019 Py_DECREF(unicode);
9020 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009021 }
9022 /* 0-terminate the output string */
9023 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009024 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009025 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009026}
9027
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028/* --- Helpers ------------------------------------------------------------ */
9029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009031any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 Py_ssize_t start,
9033 Py_ssize_t end)
9034{
9035 int kind1, kind2, kind;
9036 void *buf1, *buf2;
9037 Py_ssize_t len1, len2, result;
9038
9039 kind1 = PyUnicode_KIND(s1);
9040 kind2 = PyUnicode_KIND(s2);
9041 kind = kind1 > kind2 ? kind1 : kind2;
9042 buf1 = PyUnicode_DATA(s1);
9043 buf2 = PyUnicode_DATA(s2);
9044 if (kind1 != kind)
9045 buf1 = _PyUnicode_AsKind(s1, kind);
9046 if (!buf1)
9047 return -2;
9048 if (kind2 != kind)
9049 buf2 = _PyUnicode_AsKind(s2, kind);
9050 if (!buf2) {
9051 if (kind1 != kind) PyMem_Free(buf1);
9052 return -2;
9053 }
9054 len1 = PyUnicode_GET_LENGTH(s1);
9055 len2 = PyUnicode_GET_LENGTH(s2);
9056
Victor Stinner794d5672011-10-10 03:21:36 +02009057 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009058 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009059 case PyUnicode_1BYTE_KIND:
9060 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9061 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9062 else
9063 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9064 break;
9065 case PyUnicode_2BYTE_KIND:
9066 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9067 break;
9068 case PyUnicode_4BYTE_KIND:
9069 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9070 break;
9071 default:
9072 assert(0); result = -2;
9073 }
9074 }
9075 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009076 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009077 case PyUnicode_1BYTE_KIND:
9078 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9079 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9080 else
9081 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082 break;
9083 case PyUnicode_2BYTE_KIND:
9084 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9085 break;
9086 case PyUnicode_4BYTE_KIND:
9087 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9088 break;
9089 default:
9090 assert(0); result = -2;
9091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 }
9093
9094 if (kind1 != kind)
9095 PyMem_Free(buf1);
9096 if (kind2 != kind)
9097 PyMem_Free(buf2);
9098
9099 return result;
9100}
9101
9102Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009103_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 Py_ssize_t n_buffer,
9105 void *digits, Py_ssize_t n_digits,
9106 Py_ssize_t min_width,
9107 const char *grouping,
9108 const char *thousands_sep)
9109{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009110 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009112 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9113 return _PyUnicode_ascii_InsertThousandsGrouping(
9114 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9115 min_width, grouping, thousands_sep);
9116 else
9117 return _PyUnicode_ucs1_InsertThousandsGrouping(
9118 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9119 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 case PyUnicode_2BYTE_KIND:
9121 return _PyUnicode_ucs2_InsertThousandsGrouping(
9122 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9123 min_width, grouping, thousands_sep);
9124 case PyUnicode_4BYTE_KIND:
9125 return _PyUnicode_ucs4_InsertThousandsGrouping(
9126 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9127 min_width, grouping, thousands_sep);
9128 }
9129 assert(0);
9130 return -1;
9131}
9132
9133
/* helper macro to fixup start/end slice values
 *
 * Clamps `end` to `len`, and turns negative `start`/`end` into offsets
 * from the end of the sequence (never below 0).  `start` and `end` must be
 * modifiable lvalues; `len` is evaluated more than once, so it must not
 * have side effects.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement
 * (safe inside an unbraced if/else); invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009148
Alexander Belopolsky40018472011-02-26 01:02:56 +00009149Py_ssize_t
9150PyUnicode_Count(PyObject *str,
9151 PyObject *substr,
9152 Py_ssize_t start,
9153 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009155 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009156 PyObject* str_obj;
9157 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 int kind1, kind2, kind;
9159 void *buf1 = NULL, *buf2 = NULL;
9160 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009161
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009162 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009163 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009165 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009166 if (!sub_obj) {
9167 Py_DECREF(str_obj);
9168 return -1;
9169 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009170 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009171 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 Py_DECREF(str_obj);
9173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 }
Tim Petersced69f82003-09-16 20:30:58 +00009175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 kind1 = PyUnicode_KIND(str_obj);
9177 kind2 = PyUnicode_KIND(sub_obj);
9178 kind = kind1 > kind2 ? kind1 : kind2;
9179 buf1 = PyUnicode_DATA(str_obj);
9180 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009181 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 if (!buf1)
9183 goto onError;
9184 buf2 = PyUnicode_DATA(sub_obj);
9185 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009186 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (!buf2)
9188 goto onError;
9189 len1 = PyUnicode_GET_LENGTH(str_obj);
9190 len2 = PyUnicode_GET_LENGTH(sub_obj);
9191
9192 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009193 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009195 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9196 result = asciilib_count(
9197 ((Py_UCS1*)buf1) + start, end - start,
9198 buf2, len2, PY_SSIZE_T_MAX
9199 );
9200 else
9201 result = ucs1lib_count(
9202 ((Py_UCS1*)buf1) + start, end - start,
9203 buf2, len2, PY_SSIZE_T_MAX
9204 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 break;
9206 case PyUnicode_2BYTE_KIND:
9207 result = ucs2lib_count(
9208 ((Py_UCS2*)buf1) + start, end - start,
9209 buf2, len2, PY_SSIZE_T_MAX
9210 );
9211 break;
9212 case PyUnicode_4BYTE_KIND:
9213 result = ucs4lib_count(
9214 ((Py_UCS4*)buf1) + start, end - start,
9215 buf2, len2, PY_SSIZE_T_MAX
9216 );
9217 break;
9218 default:
9219 assert(0); result = 0;
9220 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009221
9222 Py_DECREF(sub_obj);
9223 Py_DECREF(str_obj);
9224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 if (kind1 != kind)
9226 PyMem_Free(buf1);
9227 if (kind2 != kind)
9228 PyMem_Free(buf2);
9229
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 onError:
9232 Py_DECREF(sub_obj);
9233 Py_DECREF(str_obj);
9234 if (kind1 != kind && buf1)
9235 PyMem_Free(buf1);
9236 if (kind2 != kind && buf2)
9237 PyMem_Free(buf2);
9238 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239}
9240
Alexander Belopolsky40018472011-02-26 01:02:56 +00009241Py_ssize_t
9242PyUnicode_Find(PyObject *str,
9243 PyObject *sub,
9244 Py_ssize_t start,
9245 Py_ssize_t end,
9246 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009248 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009249
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009251 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009253 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009254 if (!sub) {
9255 Py_DECREF(str);
9256 return -2;
9257 }
9258 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9259 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 Py_DECREF(str);
9261 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 }
Tim Petersced69f82003-09-16 20:30:58 +00009263
Victor Stinner794d5672011-10-10 03:21:36 +02009264 result = any_find_slice(direction,
9265 str, sub, start, end
9266 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009267
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009269 Py_DECREF(sub);
9270
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271 return result;
9272}
9273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274Py_ssize_t
9275PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9276 Py_ssize_t start, Py_ssize_t end,
9277 int direction)
9278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009280 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 if (PyUnicode_READY(str) == -1)
9282 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009283 if (start < 0 || end < 0) {
9284 PyErr_SetString(PyExc_IndexError, "string index out of range");
9285 return -2;
9286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 if (end > PyUnicode_GET_LENGTH(str))
9288 end = PyUnicode_GET_LENGTH(str);
9289 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009290 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9291 kind, end-start, ch, direction);
9292 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009294 else
9295 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296}
9297
Alexander Belopolsky40018472011-02-26 01:02:56 +00009298static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009299tailmatch(PyObject *self,
9300 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009301 Py_ssize_t start,
9302 Py_ssize_t end,
9303 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 int kind_self;
9306 int kind_sub;
9307 void *data_self;
9308 void *data_sub;
9309 Py_ssize_t offset;
9310 Py_ssize_t i;
9311 Py_ssize_t end_sub;
9312
9313 if (PyUnicode_READY(self) == -1 ||
9314 PyUnicode_READY(substring) == -1)
9315 return 0;
9316
9317 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 return 1;
9319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9321 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 kind_self = PyUnicode_KIND(self);
9326 data_self = PyUnicode_DATA(self);
9327 kind_sub = PyUnicode_KIND(substring);
9328 data_sub = PyUnicode_DATA(substring);
9329 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9330
9331 if (direction > 0)
9332 offset = end;
9333 else
9334 offset = start;
9335
9336 if (PyUnicode_READ(kind_self, data_self, offset) ==
9337 PyUnicode_READ(kind_sub, data_sub, 0) &&
9338 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9339 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9340 /* If both are of the same kind, memcmp is sufficient */
9341 if (kind_self == kind_sub) {
9342 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009343 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 data_sub,
9345 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009346 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
9348 /* otherwise we have to compare each character by first accesing it */
9349 else {
9350 /* We do not need to compare 0 and len(substring)-1 because
9351 the if statement above ensured already that they are equal
9352 when we end up here. */
9353 // TODO: honor direction and do a forward or backwards search
9354 for (i = 1; i < end_sub; ++i) {
9355 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9356 PyUnicode_READ(kind_sub, data_sub, i))
9357 return 0;
9358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
9362
9363 return 0;
9364}
9365
Alexander Belopolsky40018472011-02-26 01:02:56 +00009366Py_ssize_t
9367PyUnicode_Tailmatch(PyObject *str,
9368 PyObject *substr,
9369 Py_ssize_t start,
9370 Py_ssize_t end,
9371 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009373 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009374
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 str = PyUnicode_FromObject(str);
9376 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 substr = PyUnicode_FromObject(substr);
9379 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 Py_DECREF(str);
9381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Tim Petersced69f82003-09-16 20:30:58 +00009383
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009384 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 Py_DECREF(str);
9387 Py_DECREF(substr);
9388 return result;
9389}
9390
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391/* Apply fixfct filter to the Unicode object self and return a
9392 reference to the modified object */
9393
Alexander Belopolsky40018472011-02-26 01:02:56 +00009394static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009395fixup(PyObject *self,
9396 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 PyObject *u;
9399 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009400 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009402 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009405 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 /* fix functions return the new maximum character in a string,
9408 if the kind of the resulting unicode object does not change,
9409 everything is fine. Otherwise we need to change the string kind
9410 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009411 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009412
9413 if (maxchar_new == 0) {
9414 /* no changes */;
9415 if (PyUnicode_CheckExact(self)) {
9416 Py_DECREF(u);
9417 Py_INCREF(self);
9418 return self;
9419 }
9420 else
9421 return u;
9422 }
9423
9424 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 maxchar_new = 127;
9426 else if (maxchar_new <= 255)
9427 maxchar_new = 255;
9428 else if (maxchar_new <= 65535)
9429 maxchar_new = 65535;
9430 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009431 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432
Victor Stinnereaab6042011-12-11 22:22:39 +01009433 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009435
9436 /* In case the maximum character changed, we need to
9437 convert the string to the new category. */
9438 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9439 if (v == NULL) {
9440 Py_DECREF(u);
9441 return NULL;
9442 }
9443 if (maxchar_new > maxchar_old) {
9444 /* If the maxchar increased so that the kind changed, not all
9445 characters are representable anymore and we need to fix the
9446 string again. This only happens in very few cases. */
9447 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9448 maxchar_old = fixfct(v);
9449 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 }
9451 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009452 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009454 Py_DECREF(u);
9455 assert(_PyUnicode_CheckConsistency(v, 1));
9456 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009459static PyObject *
9460ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009462 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9463 char *resdata, *data = PyUnicode_DATA(self);
9464 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009465
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466 res = PyUnicode_New(len, 127);
9467 if (res == NULL)
9468 return NULL;
9469 resdata = PyUnicode_DATA(res);
9470 if (lower)
9471 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473 _Py_bytes_upper(resdata, data, len);
9474 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480 Py_ssize_t j;
9481 int final_sigma;
9482 Py_UCS4 c;
9483 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009484
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009485 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9486
9487 where ! is a negation and \p{xxx} is a character with property xxx.
9488 */
9489 for (j = i - 1; j >= 0; j--) {
9490 c = PyUnicode_READ(kind, data, j);
9491 if (!_PyUnicode_IsCaseIgnorable(c))
9492 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009494 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9495 if (final_sigma) {
9496 for (j = i + 1; j < length; j++) {
9497 c = PyUnicode_READ(kind, data, j);
9498 if (!_PyUnicode_IsCaseIgnorable(c))
9499 break;
9500 }
9501 final_sigma = j == length || !_PyUnicode_IsCased(c);
9502 }
9503 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504}
9505
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009506static int
9507lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9508 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009510 /* Obscure special case. */
9511 if (c == 0x3A3) {
9512 mapped[0] = handle_capital_sigma(kind, data, length, i);
9513 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009515 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516}
9517
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009518static Py_ssize_t
9519do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009521 Py_ssize_t i, k = 0;
9522 int n_res, j;
9523 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009524
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009525 c = PyUnicode_READ(kind, data, 0);
9526 n_res = _PyUnicode_ToUpperFull(c, mapped);
9527 for (j = 0; j < n_res; j++) {
9528 if (mapped[j] > *maxchar)
9529 *maxchar = mapped[j];
9530 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009532 for (i = 1; i < length; i++) {
9533 c = PyUnicode_READ(kind, data, i);
9534 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9535 for (j = 0; j < n_res; j++) {
9536 if (mapped[j] > *maxchar)
9537 *maxchar = mapped[j];
9538 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009539 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009540 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009541 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542}
9543
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009544static Py_ssize_t
9545do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9546 Py_ssize_t i, k = 0;
9547
9548 for (i = 0; i < length; i++) {
9549 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9550 int n_res, j;
9551 if (Py_UNICODE_ISUPPER(c)) {
9552 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9553 }
9554 else if (Py_UNICODE_ISLOWER(c)) {
9555 n_res = _PyUnicode_ToUpperFull(c, mapped);
9556 }
9557 else {
9558 n_res = 1;
9559 mapped[0] = c;
9560 }
9561 for (j = 0; j < n_res; j++) {
9562 if (mapped[j] > *maxchar)
9563 *maxchar = mapped[j];
9564 res[k++] = mapped[j];
9565 }
9566 }
9567 return k;
9568}
9569
9570static Py_ssize_t
9571do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9572 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009574 Py_ssize_t i, k = 0;
9575
9576 for (i = 0; i < length; i++) {
9577 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9578 int n_res, j;
9579 if (lower)
9580 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9581 else
9582 n_res = _PyUnicode_ToUpperFull(c, mapped);
9583 for (j = 0; j < n_res; j++) {
9584 if (mapped[j] > *maxchar)
9585 *maxchar = mapped[j];
9586 res[k++] = mapped[j];
9587 }
9588 }
9589 return k;
9590}
9591
9592static Py_ssize_t
9593do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9594{
9595 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9596}
9597
9598static Py_ssize_t
9599do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9600{
9601 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9602}
9603
Benjamin Petersone51757f2012-01-12 21:10:29 -05009604static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009605do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9606{
9607 Py_ssize_t i, k = 0;
9608
9609 for (i = 0; i < length; i++) {
9610 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9611 Py_UCS4 mapped[3];
9612 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9613 for (j = 0; j < n_res; j++) {
9614 if (mapped[j] > *maxchar)
9615 *maxchar = mapped[j];
9616 res[k++] = mapped[j];
9617 }
9618 }
9619 return k;
9620}
9621
9622static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009623do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9624{
9625 Py_ssize_t i, k = 0;
9626 int previous_is_cased;
9627
9628 previous_is_cased = 0;
9629 for (i = 0; i < length; i++) {
9630 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9631 Py_UCS4 mapped[3];
9632 int n_res, j;
9633
9634 if (previous_is_cased)
9635 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9636 else
9637 n_res = _PyUnicode_ToTitleFull(c, mapped);
9638
9639 for (j = 0; j < n_res; j++) {
9640 if (mapped[j] > *maxchar)
9641 *maxchar = mapped[j];
9642 res[k++] = mapped[j];
9643 }
9644
9645 previous_is_cased = _PyUnicode_IsCased(c);
9646 }
9647 return k;
9648}
9649
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650static PyObject *
9651case_operation(PyObject *self,
9652 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9653{
9654 PyObject *res = NULL;
9655 Py_ssize_t length, newlength = 0;
9656 int kind, outkind;
9657 void *data, *outdata;
9658 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9659
Benjamin Petersoneea48462012-01-16 14:28:50 -05009660 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661
9662 kind = PyUnicode_KIND(self);
9663 data = PyUnicode_DATA(self);
9664 length = PyUnicode_GET_LENGTH(self);
9665 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9666 if (tmp == NULL)
9667 return PyErr_NoMemory();
9668 newlength = perform(kind, data, length, tmp, &maxchar);
9669 res = PyUnicode_New(newlength, maxchar);
9670 if (res == NULL)
9671 goto leave;
9672 tmpend = tmp + newlength;
9673 outdata = PyUnicode_DATA(res);
9674 outkind = PyUnicode_KIND(res);
9675 switch (outkind) {
9676 case PyUnicode_1BYTE_KIND:
9677 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9678 break;
9679 case PyUnicode_2BYTE_KIND:
9680 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9681 break;
9682 case PyUnicode_4BYTE_KIND:
9683 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9684 break;
9685 default:
9686 assert(0);
9687 break;
9688 }
9689 leave:
9690 PyMem_FREE(tmp);
9691 return res;
9692}
9693
Tim Peters8ce9f162004-08-27 01:49:32 +00009694PyObject *
9695PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009698 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009700 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009701 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9702 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009703 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009705 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009707 int use_memcpy;
9708 unsigned char *res_data = NULL, *sep_data = NULL;
9709 PyObject *last_obj;
9710 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
Tim Peters05eba1f2004-08-27 21:32:02 +00009712 fseq = PySequence_Fast(seq, "");
9713 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009715 }
9716
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 /* NOTE: the following code can't call back into Python code,
9718 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009719 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009720
Tim Peters05eba1f2004-08-27 21:32:02 +00009721 seqlen = PySequence_Fast_GET_SIZE(fseq);
9722 /* If empty sequence, return u"". */
9723 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009724 Py_DECREF(fseq);
9725 Py_INCREF(unicode_empty);
9726 res = unicode_empty;
9727 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009728 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009729
Tim Peters05eba1f2004-08-27 21:32:02 +00009730 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009731 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009732 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009733 if (seqlen == 1) {
9734 if (PyUnicode_CheckExact(items[0])) {
9735 res = items[0];
9736 Py_INCREF(res);
9737 Py_DECREF(fseq);
9738 return res;
9739 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009740 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009741 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009742 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009744 /* Set up sep and seplen */
9745 if (separator == NULL) {
9746 /* fall back to a blank space separator */
9747 sep = PyUnicode_FromOrdinal(' ');
9748 if (!sep)
9749 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009751 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009752 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009753 else {
9754 if (!PyUnicode_Check(separator)) {
9755 PyErr_Format(PyExc_TypeError,
9756 "separator: expected str instance,"
9757 " %.80s found",
9758 Py_TYPE(separator)->tp_name);
9759 goto onError;
9760 }
9761 if (PyUnicode_READY(separator))
9762 goto onError;
9763 sep = separator;
9764 seplen = PyUnicode_GET_LENGTH(separator);
9765 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9766 /* inc refcount to keep this code path symmetric with the
9767 above case of a blank separator */
9768 Py_INCREF(sep);
9769 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009770 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009771 }
9772
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009773 /* There are at least two things to join, or else we have a subclass
9774 * of str in the sequence.
9775 * Do a pre-pass to figure out the total amount of space we'll
9776 * need (sz), and see whether all argument are strings.
9777 */
9778 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009779#ifdef Py_DEBUG
9780 use_memcpy = 0;
9781#else
9782 use_memcpy = 1;
9783#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009784 for (i = 0; i < seqlen; i++) {
9785 const Py_ssize_t old_sz = sz;
9786 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 if (!PyUnicode_Check(item)) {
9788 PyErr_Format(PyExc_TypeError,
9789 "sequence item %zd: expected str instance,"
9790 " %.80s found",
9791 i, Py_TYPE(item)->tp_name);
9792 goto onError;
9793 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (PyUnicode_READY(item) == -1)
9795 goto onError;
9796 sz += PyUnicode_GET_LENGTH(item);
9797 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009798 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009799 if (i != 0)
9800 sz += seplen;
9801 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9802 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009803 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009804 goto onError;
9805 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009806 if (use_memcpy && last_obj != NULL) {
9807 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9808 use_memcpy = 0;
9809 }
9810 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009811 }
Tim Petersced69f82003-09-16 20:30:58 +00009812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009814 if (res == NULL)
9815 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009816
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009817 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009818#ifdef Py_DEBUG
9819 use_memcpy = 0;
9820#else
9821 if (use_memcpy) {
9822 res_data = PyUnicode_1BYTE_DATA(res);
9823 kind = PyUnicode_KIND(res);
9824 if (seplen != 0)
9825 sep_data = PyUnicode_1BYTE_DATA(sep);
9826 }
9827#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009829 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009830 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009832 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009833 if (use_memcpy) {
9834 Py_MEMCPY(res_data,
9835 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009836 kind * seplen);
9837 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009838 }
9839 else {
9840 copy_characters(res, res_offset, sep, 0, seplen);
9841 res_offset += seplen;
9842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009843 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009844 itemlen = PyUnicode_GET_LENGTH(item);
9845 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009846 if (use_memcpy) {
9847 Py_MEMCPY(res_data,
9848 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009849 kind * itemlen);
9850 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009851 }
9852 else {
9853 copy_characters(res, res_offset, item, 0, itemlen);
9854 res_offset += itemlen;
9855 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009856 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009857 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009858 if (use_memcpy)
9859 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009860 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009861 else
9862 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009863
Tim Peters05eba1f2004-08-27 21:32:02 +00009864 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009866 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
Benjamin Peterson29060642009-01-31 22:14:21 +00009869 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009872 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 return NULL;
9874}
9875
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width (1, 2 or 4 byte kind) and must not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion, so arbitrary
   expressions may be passed.  Note that `kind`, `value` and `length` are
   evaluated more than once: do not pass expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9899
Victor Stinner3fe55312012-01-04 00:33:50 +01009900Py_ssize_t
9901PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9902 Py_UCS4 fill_char)
9903{
9904 Py_ssize_t maxlen;
9905 enum PyUnicode_Kind kind;
9906 void *data;
9907
9908 if (!PyUnicode_Check(unicode)) {
9909 PyErr_BadInternalCall();
9910 return -1;
9911 }
9912 if (PyUnicode_READY(unicode) == -1)
9913 return -1;
9914 if (unicode_check_modifiable(unicode))
9915 return -1;
9916
9917 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9918 PyErr_SetString(PyExc_ValueError,
9919 "fill character is bigger than "
9920 "the string maximum character");
9921 return -1;
9922 }
9923
9924 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9925 length = Py_MIN(maxlen, length);
9926 if (length <= 0)
9927 return 0;
9928
9929 kind = PyUnicode_KIND(unicode);
9930 data = PyUnicode_DATA(unicode);
9931 FILL(kind, data, fill_char, start, length);
9932 return length;
9933}
9934
Victor Stinner9310abb2011-10-05 00:59:23 +02009935static PyObject *
9936pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009937 Py_ssize_t left,
9938 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 PyObject *u;
9942 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009943 int kind;
9944 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
9946 if (left < 0)
9947 left = 0;
9948 if (right < 0)
9949 right = 0;
9950
Victor Stinnerc4b49542011-12-11 22:44:26 +01009951 if (left == 0 && right == 0)
9952 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9955 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009956 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9957 return NULL;
9958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9960 if (fill > maxchar)
9961 maxchar = fill;
9962 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009963 if (!u)
9964 return NULL;
9965
9966 kind = PyUnicode_KIND(u);
9967 data = PyUnicode_DATA(u);
9968 if (left)
9969 FILL(kind, data, fill, 0, left);
9970 if (right)
9971 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009972 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009973 assert(_PyUnicode_CheckConsistency(u, 1));
9974 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
Alexander Belopolsky40018472011-02-26 01:02:56 +00009978PyObject *
9979PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982
9983 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009984 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009986 if (PyUnicode_READY(string) == -1) {
9987 Py_DECREF(string);
9988 return NULL;
9989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990
Benjamin Petersonead6b532011-12-20 17:23:42 -06009991 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009993 if (PyUnicode_IS_ASCII(string))
9994 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009995 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009996 PyUnicode_GET_LENGTH(string), keepends);
9997 else
9998 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 break;
10002 case PyUnicode_2BYTE_KIND:
10003 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010004 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 PyUnicode_GET_LENGTH(string), keepends);
10006 break;
10007 case PyUnicode_4BYTE_KIND:
10008 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010009 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 PyUnicode_GET_LENGTH(string), keepends);
10011 break;
10012 default:
10013 assert(0);
10014 list = 0;
10015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 Py_DECREF(string);
10017 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018}
10019
Alexander Belopolsky40018472011-02-26 01:02:56 +000010020static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010021split(PyObject *self,
10022 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010023 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 int kind1, kind2, kind;
10026 void *buf1, *buf2;
10027 Py_ssize_t len1, len2;
10028 PyObject* out;
10029
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010031 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (PyUnicode_READY(self) == -1)
10034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010037 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039 if (PyUnicode_IS_ASCII(self))
10040 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010041 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 PyUnicode_GET_LENGTH(self), maxcount
10043 );
10044 else
10045 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010046 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010047 PyUnicode_GET_LENGTH(self), maxcount
10048 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 case PyUnicode_2BYTE_KIND:
10050 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010051 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 PyUnicode_GET_LENGTH(self), maxcount
10053 );
10054 case PyUnicode_4BYTE_KIND:
10055 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010056 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 PyUnicode_GET_LENGTH(self), maxcount
10058 );
10059 default:
10060 assert(0);
10061 return NULL;
10062 }
10063
10064 if (PyUnicode_READY(substring) == -1)
10065 return NULL;
10066
10067 kind1 = PyUnicode_KIND(self);
10068 kind2 = PyUnicode_KIND(substring);
10069 kind = kind1 > kind2 ? kind1 : kind2;
10070 buf1 = PyUnicode_DATA(self);
10071 buf2 = PyUnicode_DATA(substring);
10072 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 if (!buf1)
10075 return NULL;
10076 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010077 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (!buf2) {
10079 if (kind1 != kind) PyMem_Free(buf1);
10080 return NULL;
10081 }
10082 len1 = PyUnicode_GET_LENGTH(self);
10083 len2 = PyUnicode_GET_LENGTH(substring);
10084
Benjamin Petersonead6b532011-12-20 17:23:42 -060010085 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10088 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010089 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010090 else
10091 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 case PyUnicode_2BYTE_KIND:
10095 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 case PyUnicode_4BYTE_KIND:
10099 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 break;
10102 default:
10103 out = NULL;
10104 }
10105 if (kind1 != kind)
10106 PyMem_Free(buf1);
10107 if (kind2 != kind)
10108 PyMem_Free(buf2);
10109 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110}
10111
Alexander Belopolsky40018472011-02-26 01:02:56 +000010112static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010113rsplit(PyObject *self,
10114 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010115 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 int kind1, kind2, kind;
10118 void *buf1, *buf2;
10119 Py_ssize_t len1, len2;
10120 PyObject* out;
10121
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010122 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010123 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (PyUnicode_READY(self) == -1)
10126 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010129 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010131 if (PyUnicode_IS_ASCII(self))
10132 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010133 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 PyUnicode_GET_LENGTH(self), maxcount
10135 );
10136 else
10137 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010138 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010139 PyUnicode_GET_LENGTH(self), maxcount
10140 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 case PyUnicode_2BYTE_KIND:
10142 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010143 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 PyUnicode_GET_LENGTH(self), maxcount
10145 );
10146 case PyUnicode_4BYTE_KIND:
10147 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010148 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 PyUnicode_GET_LENGTH(self), maxcount
10150 );
10151 default:
10152 assert(0);
10153 return NULL;
10154 }
10155
10156 if (PyUnicode_READY(substring) == -1)
10157 return NULL;
10158
10159 kind1 = PyUnicode_KIND(self);
10160 kind2 = PyUnicode_KIND(substring);
10161 kind = kind1 > kind2 ? kind1 : kind2;
10162 buf1 = PyUnicode_DATA(self);
10163 buf2 = PyUnicode_DATA(substring);
10164 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (!buf1)
10167 return NULL;
10168 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (!buf2) {
10171 if (kind1 != kind) PyMem_Free(buf1);
10172 return NULL;
10173 }
10174 len1 = PyUnicode_GET_LENGTH(self);
10175 len2 = PyUnicode_GET_LENGTH(substring);
10176
Benjamin Petersonead6b532011-12-20 17:23:42 -060010177 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10180 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 else
10183 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 case PyUnicode_2BYTE_KIND:
10187 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 break;
10190 case PyUnicode_4BYTE_KIND:
10191 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 default:
10195 out = NULL;
10196 }
10197 if (kind1 != kind)
10198 PyMem_Free(buf1);
10199 if (kind2 != kind)
10200 PyMem_Free(buf2);
10201 return out;
10202}
10203
10204static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010205anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10206 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10211 return asciilib_find(buf1, len1, buf2, len2, offset);
10212 else
10213 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 case PyUnicode_2BYTE_KIND:
10215 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10216 case PyUnicode_4BYTE_KIND:
10217 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10218 }
10219 assert(0);
10220 return -1;
10221}
10222
10223static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10225 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010227 switch (kind) {
10228 case PyUnicode_1BYTE_KIND:
10229 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10230 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10231 else
10232 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10233 case PyUnicode_2BYTE_KIND:
10234 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10235 case PyUnicode_4BYTE_KIND:
10236 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10237 }
10238 assert(0);
10239 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010240}
10241
Alexander Belopolsky40018472011-02-26 01:02:56 +000010242static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243replace(PyObject *self, PyObject *str1,
10244 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyObject *u;
10247 char *sbuf = PyUnicode_DATA(self);
10248 char *buf1 = PyUnicode_DATA(str1);
10249 char *buf2 = PyUnicode_DATA(str2);
10250 int srelease = 0, release1 = 0, release2 = 0;
10251 int skind = PyUnicode_KIND(self);
10252 int kind1 = PyUnicode_KIND(str1);
10253 int kind2 = PyUnicode_KIND(str2);
10254 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10255 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10256 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 int mayshrink;
10258 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259
10260 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010263 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
Victor Stinner59de0ee2011-10-07 10:01:28 +020010265 if (str1 == str2)
10266 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (skind < kind1)
10268 /* substring too wide to be present */
10269 goto nothing;
10270
Victor Stinner49a0a212011-10-12 23:46:10 +020010271 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10272 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10273 /* Replacing str1 with str2 may cause a maxchar reduction in the
10274 result string. */
10275 mayshrink = (maxchar_str2 < maxchar);
10276 maxchar = Py_MAX(maxchar, maxchar_str2);
10277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010281 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010283 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010284 Py_UCS4 u1, u2;
10285 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010286 Py_ssize_t index, pos;
10287 char *src;
10288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010290 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10291 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010297 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010299
10300 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10301 index = 0;
10302 src = sbuf;
10303 while (--maxcount)
10304 {
10305 pos++;
10306 src += pos * PyUnicode_KIND(self);
10307 slen -= pos;
10308 index += pos;
10309 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10310 if (pos < 0)
10311 break;
10312 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10313 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010314 }
10315 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 int rkind = skind;
10317 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010318 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (kind1 < rkind) {
10321 /* widen substring */
10322 buf1 = _PyUnicode_AsKind(str1, rkind);
10323 if (!buf1) goto error;
10324 release1 = 1;
10325 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 if (i < 0)
10328 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (rkind > kind2) {
10330 /* widen replacement */
10331 buf2 = _PyUnicode_AsKind(str2, rkind);
10332 if (!buf2) goto error;
10333 release2 = 1;
10334 }
10335 else if (rkind < kind2) {
10336 /* widen self and buf1 */
10337 rkind = kind2;
10338 if (release1) PyMem_Free(buf1);
10339 sbuf = _PyUnicode_AsKind(self, rkind);
10340 if (!sbuf) goto error;
10341 srelease = 1;
10342 buf1 = _PyUnicode_AsKind(str1, rkind);
10343 if (!buf1) goto error;
10344 release1 = 1;
10345 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010346 u = PyUnicode_New(slen, maxchar);
10347 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010349 assert(PyUnicode_KIND(u) == rkind);
10350 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010351
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010352 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010353 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010356 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010358
10359 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010360 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010362 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010363 if (i == -1)
10364 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010365 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010367 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010371 }
10372 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 Py_ssize_t n, i, j, ires;
10374 Py_ssize_t product, new_size;
10375 int rkind = skind;
10376 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010379 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 buf1 = _PyUnicode_AsKind(str1, rkind);
10381 if (!buf1) goto error;
10382 release1 = 1;
10383 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010385 if (n == 0)
10386 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010388 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 buf2 = _PyUnicode_AsKind(str2, rkind);
10390 if (!buf2) goto error;
10391 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010394 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 rkind = kind2;
10396 sbuf = _PyUnicode_AsKind(self, rkind);
10397 if (!sbuf) goto error;
10398 srelease = 1;
10399 if (release1) PyMem_Free(buf1);
10400 buf1 = _PyUnicode_AsKind(str1, rkind);
10401 if (!buf1) goto error;
10402 release1 = 1;
10403 }
10404 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10405 PyUnicode_GET_LENGTH(str1))); */
10406 product = n * (len2-len1);
10407 if ((product / (len2-len1)) != n) {
10408 PyErr_SetString(PyExc_OverflowError,
10409 "replace string is too long");
10410 goto error;
10411 }
10412 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010413 if (new_size == 0) {
10414 Py_INCREF(unicode_empty);
10415 u = unicode_empty;
10416 goto done;
10417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10419 PyErr_SetString(PyExc_OverflowError,
10420 "replace string is too long");
10421 goto error;
10422 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010423 u = PyUnicode_New(new_size, maxchar);
10424 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 assert(PyUnicode_KIND(u) == rkind);
10427 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 ires = i = 0;
10429 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430 while (n-- > 0) {
10431 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010433 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010435 if (j == -1)
10436 break;
10437 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010439 memcpy(res + rkind * ires,
10440 sbuf + rkind * i,
10441 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443 }
10444 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010448 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 memcpy(res + rkind * ires,
10456 sbuf + rkind * i,
10457 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010458 }
10459 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460 /* interleave */
10461 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010464 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466 if (--n <= 0)
10467 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010468 memcpy(res + rkind * ires,
10469 sbuf + rkind * i,
10470 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 ires++;
10472 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010474 memcpy(res + rkind * ires,
10475 sbuf + rkind * i,
10476 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010478 }
10479
10480 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010481 unicode_adjust_maxchar(&u);
10482 if (u == NULL)
10483 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010485
10486 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (srelease)
10488 PyMem_FREE(sbuf);
10489 if (release1)
10490 PyMem_FREE(buf1);
10491 if (release2)
10492 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010493 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010495
Benjamin Peterson29060642009-01-31 22:14:21 +000010496 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (srelease)
10499 PyMem_FREE(sbuf);
10500 if (release1)
10501 PyMem_FREE(buf1);
10502 if (release2)
10503 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010504 return unicode_result_unchanged(self);
10505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 error:
10507 if (srelease && sbuf)
10508 PyMem_FREE(sbuf);
10509 if (release1 && buf1)
10510 PyMem_FREE(buf1);
10511 if (release2 && buf2)
10512 PyMem_FREE(buf2);
10513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514}
10515
10516/* --- Unicode Object Methods --------------------------------------------- */
10517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010518PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520\n\
10521Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010522characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
10524static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010525unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010527 if (PyUnicode_READY(self) == -1)
10528 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010529 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530}
10531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010532PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534\n\
10535Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010536have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537
10538static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010539unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010541 if (PyUnicode_READY(self) == -1)
10542 return NULL;
10543 if (PyUnicode_GET_LENGTH(self) == 0)
10544 return unicode_result_unchanged(self);
10545 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546}
10547
Benjamin Petersond5890c82012-01-14 13:23:30 -050010548PyDoc_STRVAR(casefold__doc__,
10549 "S.casefold() -> str\n\
10550\n\
10551Return a version of S suitable for caseless comparisons.");
10552
10553static PyObject *
10554unicode_casefold(PyObject *self)
10555{
10556 if (PyUnicode_READY(self) == -1)
10557 return NULL;
10558 if (PyUnicode_IS_ASCII(self))
10559 return ascii_upper_or_lower(self, 1);
10560 return case_operation(self, do_casefold);
10561}
10562
10563
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010564/* Argument converter. Coerces to a single unicode character */
10565
10566static int
10567convert_uc(PyObject *obj, void *addr)
10568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010570 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010571
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 uniobj = PyUnicode_FromObject(obj);
10573 if (uniobj == NULL) {
10574 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010576 return 0;
10577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 Py_DECREF(uniobj);
10582 return 0;
10583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 Py_DECREF(uniobj);
10586 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010587}
10588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010589PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010592Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010593done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
10595static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010596unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010598 Py_ssize_t marg, left;
10599 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 Py_UCS4 fillchar = ' ';
10601
Victor Stinnere9a29352011-10-01 02:14:59 +020010602 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Benjamin Petersonbac79492012-01-14 13:34:47 -050010605 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 return NULL;
10607
Victor Stinnerc4b49542011-12-11 22:44:26 +010010608 if (PyUnicode_GET_LENGTH(self) >= width)
10609 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610
Victor Stinnerc4b49542011-12-11 22:44:26 +010010611 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 left = marg / 2 + (marg & width & 1);
10613
Victor Stinner9310abb2011-10-05 00:59:23 +020010614 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615}
10616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617/* This function assumes that str1 and str2 are readied by the caller. */
10618
Marc-André Lemburge5034372000-08-08 08:04:29 +000010619static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010620unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 int kind1, kind2;
10623 void *data1, *data2;
10624 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 kind1 = PyUnicode_KIND(str1);
10627 kind2 = PyUnicode_KIND(str2);
10628 data1 = PyUnicode_DATA(str1);
10629 data2 = PyUnicode_DATA(str2);
10630 len1 = PyUnicode_GET_LENGTH(str1);
10631 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 for (i = 0; i < len1 && i < len2; ++i) {
10634 Py_UCS4 c1, c2;
10635 c1 = PyUnicode_READ(kind1, data1, i);
10636 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010637
10638 if (c1 != c2)
10639 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010640 }
10641
10642 return (len1 < len2) ? -1 : (len1 != len2);
10643}
10644
Alexander Belopolsky40018472011-02-26 01:02:56 +000010645int
10646PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10649 if (PyUnicode_READY(left) == -1 ||
10650 PyUnicode_READY(right) == -1)
10651 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010652 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010654 PyErr_Format(PyExc_TypeError,
10655 "Can't compare %.100s and %.100s",
10656 left->ob_type->tp_name,
10657 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 return -1;
10659}
10660
Martin v. Löwis5b222132007-06-10 09:51:05 +000010661int
10662PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 Py_ssize_t i;
10665 int kind;
10666 void *data;
10667 Py_UCS4 chr;
10668
Victor Stinner910337b2011-10-03 03:20:16 +020010669 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (PyUnicode_READY(uni) == -1)
10671 return -1;
10672 kind = PyUnicode_KIND(uni);
10673 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010674 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10676 if (chr != str[i])
10677 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010678 /* This check keeps Python strings that end in '\0' from comparing equal
10679 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010682 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010684 return 0;
10685}
10686
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010687
Benjamin Peterson29060642009-01-31 22:14:21 +000010688#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010689 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010690
Alexander Belopolsky40018472011-02-26 01:02:56 +000010691PyObject *
10692PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010693{
10694 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010695
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010696 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10697 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 if (PyUnicode_READY(left) == -1 ||
10699 PyUnicode_READY(right) == -1)
10700 return NULL;
10701 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10702 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010703 if (op == Py_EQ) {
10704 Py_INCREF(Py_False);
10705 return Py_False;
10706 }
10707 if (op == Py_NE) {
10708 Py_INCREF(Py_True);
10709 return Py_True;
10710 }
10711 }
10712 if (left == right)
10713 result = 0;
10714 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010715 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010716
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010717 /* Convert the return value to a Boolean */
10718 switch (op) {
10719 case Py_EQ:
10720 v = TEST_COND(result == 0);
10721 break;
10722 case Py_NE:
10723 v = TEST_COND(result != 0);
10724 break;
10725 case Py_LE:
10726 v = TEST_COND(result <= 0);
10727 break;
10728 case Py_GE:
10729 v = TEST_COND(result >= 0);
10730 break;
10731 case Py_LT:
10732 v = TEST_COND(result == -1);
10733 break;
10734 case Py_GT:
10735 v = TEST_COND(result == 1);
10736 break;
10737 default:
10738 PyErr_BadArgument();
10739 return NULL;
10740 }
10741 Py_INCREF(v);
10742 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010744
Brian Curtindfc80e32011-08-10 20:28:54 -050010745 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010746}
10747
Alexander Belopolsky40018472011-02-26 01:02:56 +000010748int
10749PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010750{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010751 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 int kind1, kind2, kind;
10753 void *buf1, *buf2;
10754 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010755 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010756
10757 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010758 sub = PyUnicode_FromObject(element);
10759 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 PyErr_Format(PyExc_TypeError,
10761 "'in <string>' requires string as left operand, not %s",
10762 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010763 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010764 }
10765
Thomas Wouters477c8d52006-05-27 19:21:47 +000010766 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010767 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010768 Py_DECREF(sub);
10769 return -1;
10770 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010771 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10772 Py_DECREF(sub);
10773 Py_DECREF(str);
10774 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 kind1 = PyUnicode_KIND(str);
10777 kind2 = PyUnicode_KIND(sub);
10778 kind = kind1 > kind2 ? kind1 : kind2;
10779 buf1 = PyUnicode_DATA(str);
10780 buf2 = PyUnicode_DATA(sub);
10781 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010782 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (!buf1) {
10784 Py_DECREF(sub);
10785 return -1;
10786 }
10787 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010788 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 if (!buf2) {
10790 Py_DECREF(sub);
10791 if (kind1 != kind) PyMem_Free(buf1);
10792 return -1;
10793 }
10794 len1 = PyUnicode_GET_LENGTH(str);
10795 len2 = PyUnicode_GET_LENGTH(sub);
10796
Benjamin Petersonead6b532011-12-20 17:23:42 -060010797 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 case PyUnicode_1BYTE_KIND:
10799 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10800 break;
10801 case PyUnicode_2BYTE_KIND:
10802 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10803 break;
10804 case PyUnicode_4BYTE_KIND:
10805 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10806 break;
10807 default:
10808 result = -1;
10809 assert(0);
10810 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010811
10812 Py_DECREF(str);
10813 Py_DECREF(sub);
10814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (kind1 != kind)
10816 PyMem_Free(buf1);
10817 if (kind2 != kind)
10818 PyMem_Free(buf2);
10819
Guido van Rossum403d68b2000-03-13 15:55:09 +000010820 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010821}
10822
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823/* Concat to string or Unicode object giving a new Unicode object. */
10824
Alexander Belopolsky40018472011-02-26 01:02:56 +000010825PyObject *
10826PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010829 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010830 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
10832 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010838 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010841 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010845 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848 }
10849
Victor Stinner488fa492011-12-12 00:01:39 +010010850 u_len = PyUnicode_GET_LENGTH(u);
10851 v_len = PyUnicode_GET_LENGTH(v);
10852 if (u_len > PY_SSIZE_T_MAX - v_len) {
10853 PyErr_SetString(PyExc_OverflowError,
10854 "strings are too large to concat");
10855 goto onError;
10856 }
10857 new_len = u_len + v_len;
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010860 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10861 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010864 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010867 copy_characters(w, 0, u, 0, u_len);
10868 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869 Py_DECREF(u);
10870 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010871 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875 Py_XDECREF(u);
10876 Py_XDECREF(v);
10877 return NULL;
10878}
10879
Walter Dörwald1ab83302007-05-18 17:15:44 +000010880void
Victor Stinner23e56682011-10-03 03:54:37 +020010881PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010882{
Victor Stinner23e56682011-10-03 03:54:37 +020010883 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010884 Py_UCS4 maxchar, maxchar2;
10885 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010886
10887 if (p_left == NULL) {
10888 if (!PyErr_Occurred())
10889 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010890 return;
10891 }
Victor Stinner23e56682011-10-03 03:54:37 +020010892 left = *p_left;
10893 if (right == NULL || !PyUnicode_Check(left)) {
10894 if (!PyErr_Occurred())
10895 PyErr_BadInternalCall();
10896 goto error;
10897 }
10898
Benjamin Petersonbac79492012-01-14 13:34:47 -050010899 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010900 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010901 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010902 goto error;
10903
Victor Stinner488fa492011-12-12 00:01:39 +010010904 /* Shortcuts */
10905 if (left == unicode_empty) {
10906 Py_DECREF(left);
10907 Py_INCREF(right);
10908 *p_left = right;
10909 return;
10910 }
10911 if (right == unicode_empty)
10912 return;
10913
10914 left_len = PyUnicode_GET_LENGTH(left);
10915 right_len = PyUnicode_GET_LENGTH(right);
10916 if (left_len > PY_SSIZE_T_MAX - right_len) {
10917 PyErr_SetString(PyExc_OverflowError,
10918 "strings are too large to concat");
10919 goto error;
10920 }
10921 new_len = left_len + right_len;
10922
10923 if (unicode_modifiable(left)
10924 && PyUnicode_CheckExact(right)
10925 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010926 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10927 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010928 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010929 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010930 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10931 {
10932 /* append inplace */
10933 if (unicode_resize(p_left, new_len) != 0) {
10934 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10935 * deallocated so it cannot be put back into
10936 * 'variable'. The MemoryError is raised when there
10937 * is no value in 'variable', which might (very
10938 * remotely) be a cause of incompatibilities.
10939 */
10940 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010941 }
Victor Stinner488fa492011-12-12 00:01:39 +010010942 /* copy 'right' into the newly allocated area of 'left' */
10943 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010944 }
Victor Stinner488fa492011-12-12 00:01:39 +010010945 else {
10946 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10947 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10948 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010949
Victor Stinner488fa492011-12-12 00:01:39 +010010950 /* Concat the two Unicode strings */
10951 res = PyUnicode_New(new_len, maxchar);
10952 if (res == NULL)
10953 goto error;
10954 copy_characters(res, 0, left, 0, left_len);
10955 copy_characters(res, left_len, right, 0, right_len);
10956 Py_DECREF(left);
10957 *p_left = res;
10958 }
10959 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010960 return;
10961
10962error:
Victor Stinner488fa492011-12-12 00:01:39 +010010963 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010964}
10965
10966void
10967PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10968{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010969 PyUnicode_Append(pleft, right);
10970 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010971}
10972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010973PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010976Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010977string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979
10980static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010981unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010983 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010984 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010985 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 int kind1, kind2, kind;
10988 void *buf1, *buf2;
10989 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990
Jesus Ceaac451502011-04-20 17:09:23 +020010991 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10992 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 kind1 = PyUnicode_KIND(self);
10996 kind2 = PyUnicode_KIND(substring);
10997 kind = kind1 > kind2 ? kind1 : kind2;
10998 buf1 = PyUnicode_DATA(self);
10999 buf2 = PyUnicode_DATA(substring);
11000 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011001 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 if (!buf1) {
11003 Py_DECREF(substring);
11004 return NULL;
11005 }
11006 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011007 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (!buf2) {
11009 Py_DECREF(substring);
11010 if (kind1 != kind) PyMem_Free(buf1);
11011 return NULL;
11012 }
11013 len1 = PyUnicode_GET_LENGTH(self);
11014 len2 = PyUnicode_GET_LENGTH(substring);
11015
11016 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011017 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 case PyUnicode_1BYTE_KIND:
11019 iresult = ucs1lib_count(
11020 ((Py_UCS1*)buf1) + start, end - start,
11021 buf2, len2, PY_SSIZE_T_MAX
11022 );
11023 break;
11024 case PyUnicode_2BYTE_KIND:
11025 iresult = ucs2lib_count(
11026 ((Py_UCS2*)buf1) + start, end - start,
11027 buf2, len2, PY_SSIZE_T_MAX
11028 );
11029 break;
11030 case PyUnicode_4BYTE_KIND:
11031 iresult = ucs4lib_count(
11032 ((Py_UCS4*)buf1) + start, end - start,
11033 buf2, len2, PY_SSIZE_T_MAX
11034 );
11035 break;
11036 default:
11037 assert(0); iresult = 0;
11038 }
11039
11040 result = PyLong_FromSsize_t(iresult);
11041
11042 if (kind1 != kind)
11043 PyMem_Free(buf1);
11044 if (kind2 != kind)
11045 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
11047 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011048
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049 return result;
11050}
11051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011053 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011055Encode S using the codec registered for encoding. Default encoding\n\
11056is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011057handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011058a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11059'xmlcharrefreplace' as well as any other name registered with\n\
11060codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061
11062static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011063unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011065 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066 char *encoding = NULL;
11067 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011068
Benjamin Peterson308d6372009-09-18 21:42:35 +000011069 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11070 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011072 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011073}
11074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011075PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077\n\
11078Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
11081static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011082unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011084 Py_ssize_t i, j, line_pos, src_len, incr;
11085 Py_UCS4 ch;
11086 PyObject *u;
11087 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011089 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011090 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
11092 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
Antoine Pitrou22425222011-10-04 19:10:51 +020011095 if (PyUnicode_READY(self) == -1)
11096 return NULL;
11097
Thomas Wouters7e474022000-07-16 12:04:32 +000011098 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011099 src_len = PyUnicode_GET_LENGTH(self);
11100 i = j = line_pos = 0;
11101 kind = PyUnicode_KIND(self);
11102 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011103 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011104 for (; i < src_len; i++) {
11105 ch = PyUnicode_READ(kind, src_data, i);
11106 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011107 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011108 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011109 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011111 goto overflow;
11112 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011114 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011118 goto overflow;
11119 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011121 if (ch == '\n' || ch == '\r')
11122 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011124 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011125 if (!found)
11126 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011127
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011129 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 if (!u)
11131 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011132 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
Antoine Pitroue71d5742011-10-04 15:55:09 +020011134 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Antoine Pitroue71d5742011-10-04 15:55:09 +020011136 for (; i < src_len; i++) {
11137 ch = PyUnicode_READ(kind, src_data, i);
11138 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011140 incr = tabsize - (line_pos % tabsize);
11141 line_pos += incr;
11142 while (incr--) {
11143 PyUnicode_WRITE(kind, dest_data, j, ' ');
11144 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011145 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011147 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011149 line_pos++;
11150 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011151 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011152 if (ch == '\n' || ch == '\r')
11153 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011155 }
11156 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011157 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011158
Antoine Pitroue71d5742011-10-04 15:55:09 +020011159 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011160 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162}
11163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011164PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166\n\
11167Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011168such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169arguments start and end are interpreted as in slice notation.\n\
11170\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172
11173static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011176 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011177 Py_ssize_t start;
11178 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
Jesus Ceaac451502011-04-20 17:09:23 +020011181 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11182 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (PyUnicode_READY(self) == -1)
11186 return NULL;
11187 if (PyUnicode_READY(substring) == -1)
11188 return NULL;
11189
Victor Stinner7931d9a2011-11-04 00:22:48 +010011190 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (result == -2)
11195 return NULL;
11196
Christian Heimes217cfd12007-12-02 14:31:20 +000011197 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198}
11199
11200static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011201unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011203 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11204 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207}
11208
Guido van Rossumc2504932007-09-18 19:42:40 +000011209/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011210 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011211static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011212unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213{
Guido van Rossumc2504932007-09-18 19:42:40 +000011214 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011215 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (_PyUnicode_HASH(self) != -1)
11218 return _PyUnicode_HASH(self);
11219 if (PyUnicode_READY(self) == -1)
11220 return -1;
11221 len = PyUnicode_GET_LENGTH(self);
11222
11223 /* The hash function as a macro, gets expanded three times below. */
11224#define HASH(P) \
11225 x = (Py_uhash_t)*P << 7; \
11226 while (--len >= 0) \
Gregory P. Smithf5b62a92012-01-14 15:45:13 -080011227 x = (_PyHASH_MULTIPLIER*x) ^ (Py_uhash_t)*P++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228
11229 switch (PyUnicode_KIND(self)) {
11230 case PyUnicode_1BYTE_KIND: {
11231 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11232 HASH(c);
11233 break;
11234 }
11235 case PyUnicode_2BYTE_KIND: {
11236 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11237 HASH(s);
11238 break;
11239 }
11240 default: {
11241 Py_UCS4 *l;
11242 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11243 "Impossible switch case in unicode_hash");
11244 l = PyUnicode_4BYTE_DATA(self);
11245 HASH(l);
11246 break;
11247 }
11248 }
11249 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11250
Guido van Rossumc2504932007-09-18 19:42:40 +000011251 if (x == -1)
11252 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011254 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011266 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011267 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011268 Py_ssize_t start;
11269 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
Jesus Ceaac451502011-04-20 17:09:23 +020011271 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11272 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (PyUnicode_READY(self) == -1)
11276 return NULL;
11277 if (PyUnicode_READY(substring) == -1)
11278 return NULL;
11279
Victor Stinner7931d9a2011-11-04 00:22:48 +010011280 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 if (result == -2)
11285 return NULL;
11286
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 if (result < 0) {
11288 PyErr_SetString(PyExc_ValueError, "substring not found");
11289 return NULL;
11290 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011291
Christian Heimes217cfd12007-12-02 14:31:20 +000011292 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293}
11294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011295PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011298Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011299at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
11301static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011302unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 Py_ssize_t i, length;
11305 int kind;
11306 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 int cased;
11308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 if (PyUnicode_READY(self) == -1)
11310 return NULL;
11311 length = PyUnicode_GET_LENGTH(self);
11312 kind = PyUnicode_KIND(self);
11313 data = PyUnicode_DATA(self);
11314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 1)
11317 return PyBool_FromLong(
11318 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011320 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011323
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 for (i = 0; i < length; i++) {
11326 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011327
Benjamin Peterson29060642009-01-31 22:14:21 +000011328 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11329 return PyBool_FromLong(0);
11330 else if (!cased && Py_UNICODE_ISLOWER(ch))
11331 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011333 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334}
11335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011339Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011340at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
11342static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011343unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 Py_ssize_t i, length;
11346 int kind;
11347 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348 int cased;
11349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 if (PyUnicode_READY(self) == -1)
11351 return NULL;
11352 length = PyUnicode_GET_LENGTH(self);
11353 kind = PyUnicode_KIND(self);
11354 data = PyUnicode_DATA(self);
11355
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 if (length == 1)
11358 return PyBool_FromLong(
11359 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011361 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011364
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 for (i = 0; i < length; i++) {
11367 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011368
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11370 return PyBool_FromLong(0);
11371 else if (!cased && Py_UNICODE_ISUPPER(ch))
11372 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011374 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375}
11376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011380Return True if S is a titlecased string and there is at least one\n\
11381character in S, i.e. upper- and titlecase characters may only\n\
11382follow uncased characters and lowercase characters only cased ones.\n\
11383Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384
11385static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011386unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 Py_ssize_t i, length;
11389 int kind;
11390 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 int cased, previous_is_cased;
11392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 if (PyUnicode_READY(self) == -1)
11394 return NULL;
11395 length = PyUnicode_GET_LENGTH(self);
11396 kind = PyUnicode_KIND(self);
11397 data = PyUnicode_DATA(self);
11398
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 1) {
11401 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11402 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11403 (Py_UNICODE_ISUPPER(ch) != 0));
11404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011406 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011409
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 cased = 0;
11411 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 for (i = 0; i < length; i++) {
11413 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011414
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11416 if (previous_is_cased)
11417 return PyBool_FromLong(0);
11418 previous_is_cased = 1;
11419 cased = 1;
11420 }
11421 else if (Py_UNICODE_ISLOWER(ch)) {
11422 if (!previous_is_cased)
11423 return PyBool_FromLong(0);
11424 previous_is_cased = 1;
11425 cased = 1;
11426 }
11427 else
11428 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011430 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011436Return True if all characters in S are whitespace\n\
11437and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011440unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 Py_ssize_t i, length;
11443 int kind;
11444 void *data;
11445
11446 if (PyUnicode_READY(self) == -1)
11447 return NULL;
11448 length = PyUnicode_GET_LENGTH(self);
11449 kind = PyUnicode_KIND(self);
11450 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (length == 1)
11454 return PyBool_FromLong(
11455 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011457 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 for (i = 0; i < length; i++) {
11462 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011463 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467}
11468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011469PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011471\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011472Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011474
11475static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011476unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 Py_ssize_t i, length;
11479 int kind;
11480 void *data;
11481
11482 if (PyUnicode_READY(self) == -1)
11483 return NULL;
11484 length = PyUnicode_GET_LENGTH(self);
11485 kind = PyUnicode_KIND(self);
11486 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011488 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (length == 1)
11490 return PyBool_FromLong(
11491 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011492
11493 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 for (i = 0; i < length; i++) {
11498 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011500 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011501 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011502}
11503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011504PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011506\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011507Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011508and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011509
11510static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011511unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 int kind;
11514 void *data;
11515 Py_ssize_t len, i;
11516
11517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
11522 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011523
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011524 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 if (len == 1) {
11526 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11527 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11528 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011529
11530 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 for (i = 0; i < len; i++) {
11535 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011536 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011538 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011539 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011540}
11541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011542PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011545Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
11548static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011549unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 Py_ssize_t i, length;
11552 int kind;
11553 void *data;
11554
11555 if (PyUnicode_READY(self) == -1)
11556 return NULL;
11557 length = PyUnicode_GET_LENGTH(self);
11558 kind = PyUnicode_KIND(self);
11559 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 if (length == 1)
11563 return PyBool_FromLong(
11564 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011566 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 for (i = 0; i < length; i++) {
11571 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011574 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575}
11576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011577PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011580Return True if all characters in S are digits\n\
11581and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011584unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 Py_ssize_t i, length;
11587 int kind;
11588 void *data;
11589
11590 if (PyUnicode_READY(self) == -1)
11591 return NULL;
11592 length = PyUnicode_GET_LENGTH(self);
11593 kind = PyUnicode_KIND(self);
11594 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 if (length == 1) {
11598 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11599 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011602 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 for (i = 0; i < length; i++) {
11607 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011616Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
11619static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011620unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 Py_ssize_t i, length;
11623 int kind;
11624 void *data;
11625
11626 if (PyUnicode_READY(self) == -1)
11627 return NULL;
11628 length = PyUnicode_GET_LENGTH(self);
11629 kind = PyUnicode_KIND(self);
11630 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 if (length == 1)
11634 return PyBool_FromLong(
11635 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011637 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 for (i = 0; i < length; i++) {
11642 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011643 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011645 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646}
11647
Martin v. Löwis47383402007-08-15 07:32:56 +000011648int
11649PyUnicode_IsIdentifier(PyObject *self)
11650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 int kind;
11652 void *data;
11653 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011654 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (PyUnicode_READY(self) == -1) {
11657 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 }
11660
11661 /* Special case for empty strings */
11662 if (PyUnicode_GET_LENGTH(self) == 0)
11663 return 0;
11664 kind = PyUnicode_KIND(self);
11665 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011666
11667 /* PEP 3131 says that the first character must be in
11668 XID_Start and subsequent characters in XID_Continue,
11669 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011670 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011671 letters, digits, underscore). However, given the current
11672 definition of XID_Start and XID_Continue, it is sufficient
11673 to check just for these, except that _ must be allowed
11674 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011676 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011677 return 0;
11678
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011679 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011682 return 1;
11683}
11684
11685PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011687\n\
11688Return True if S is a valid identifier according\n\
11689to the language definition.");
11690
11691static PyObject*
11692unicode_isidentifier(PyObject *self)
11693{
11694 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11695}
11696
Georg Brandl559e5d72008-06-11 18:37:52 +000011697PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011699\n\
11700Return True if all characters in S are considered\n\
11701printable in repr() or S is empty, False otherwise.");
11702
11703static PyObject*
11704unicode_isprintable(PyObject *self)
11705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 Py_ssize_t i, length;
11707 int kind;
11708 void *data;
11709
11710 if (PyUnicode_READY(self) == -1)
11711 return NULL;
11712 length = PyUnicode_GET_LENGTH(self);
11713 kind = PyUnicode_KIND(self);
11714 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011715
11716 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 if (length == 1)
11718 return PyBool_FromLong(
11719 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 for (i = 0; i < length; i++) {
11722 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011723 Py_RETURN_FALSE;
11724 }
11725 }
11726 Py_RETURN_TRUE;
11727}
11728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011729PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011730 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731\n\
11732Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011733iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011736unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011738 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739}
11740
Martin v. Löwis18e16552006-02-15 17:27:45 +000011741static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011742unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (PyUnicode_READY(self) == -1)
11745 return -1;
11746 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011749PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011752Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011753done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
11755static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011756unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011758 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 Py_UCS4 fillchar = ' ';
11760
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011761 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 return NULL;
11763
Benjamin Petersonbac79492012-01-14 13:34:47 -050011764 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
Victor Stinnerc4b49542011-12-11 22:44:26 +010011767 if (PyUnicode_GET_LENGTH(self) >= width)
11768 return unicode_result_unchanged(self);
11769
11770 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771}
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
11778static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011779unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783 if (PyUnicode_IS_ASCII(self))
11784 return ascii_upper_or_lower(self, 1);
11785 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786}
11787
/* Selectors naming which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11796
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797/* externally visible for str.strip(unicode) */
11798PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011799_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 void *data;
11802 int kind;
11803 Py_ssize_t i, j, len;
11804 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11807 return NULL;
11808
11809 kind = PyUnicode_KIND(self);
11810 data = PyUnicode_DATA(self);
11811 len = PyUnicode_GET_LENGTH(self);
11812 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11813 PyUnicode_DATA(sepobj),
11814 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011815
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 i = 0;
11817 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 while (i < len &&
11819 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 i++;
11821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011823
Benjamin Peterson14339b62009-01-31 16:36:08 +000011824 j = len;
11825 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 do {
11827 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 } while (j >= i &&
11829 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011831 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832
Victor Stinner7931d9a2011-11-04 00:22:48 +010011833 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834}
11835
11836PyObject*
11837PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11838{
11839 unsigned char *data;
11840 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011841 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842
Victor Stinnerde636f32011-10-01 03:55:54 +020011843 if (PyUnicode_READY(self) == -1)
11844 return NULL;
11845
11846 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11847
Victor Stinner12bab6d2011-10-01 01:53:49 +020011848 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011849 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850
Victor Stinner12bab6d2011-10-01 01:53:49 +020011851 length = end - start;
11852 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011853 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854
Victor Stinnerde636f32011-10-01 03:55:54 +020011855 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011856 PyErr_SetString(PyExc_IndexError, "string index out of range");
11857 return NULL;
11858 }
11859
Victor Stinnerb9275c12011-10-05 14:01:42 +020011860 if (PyUnicode_IS_ASCII(self)) {
11861 kind = PyUnicode_KIND(self);
11862 data = PyUnicode_1BYTE_DATA(self);
11863 return unicode_fromascii(data + start, length);
11864 }
11865 else {
11866 kind = PyUnicode_KIND(self);
11867 data = PyUnicode_1BYTE_DATA(self);
11868 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011869 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011870 length);
11871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873
11874static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011875do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 int kind;
11878 void *data;
11879 Py_ssize_t len, i, j;
11880
11881 if (PyUnicode_READY(self) == -1)
11882 return NULL;
11883
11884 kind = PyUnicode_KIND(self);
11885 data = PyUnicode_DATA(self);
11886 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011887
Benjamin Peterson14339b62009-01-31 16:36:08 +000011888 i = 0;
11889 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011891 i++;
11892 }
11893 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011894
Benjamin Peterson14339b62009-01-31 16:36:08 +000011895 j = len;
11896 if (striptype != LEFTSTRIP) {
11897 do {
11898 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011900 j++;
11901 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011902
Victor Stinner7931d9a2011-11-04 00:22:48 +010011903 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011906
11907static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011908do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011909{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011911
Benjamin Peterson14339b62009-01-31 16:36:08 +000011912 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11913 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011914
Benjamin Peterson14339b62009-01-31 16:36:08 +000011915 if (sep != NULL && sep != Py_None) {
11916 if (PyUnicode_Check(sep))
11917 return _PyUnicode_XStrip(self, striptype, sep);
11918 else {
11919 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 "%s arg must be None or str",
11921 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011922 return NULL;
11923 }
11924 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011925
Benjamin Peterson14339b62009-01-31 16:36:08 +000011926 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011927}
11928
11929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011930PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011932\n\
11933Return a copy of the string S with leading and trailing\n\
11934whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011935If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011936
11937static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011938unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011939{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011940 if (PyTuple_GET_SIZE(args) == 0)
11941 return do_strip(self, BOTHSTRIP); /* Common case */
11942 else
11943 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011944}
11945
11946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011947PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011949\n\
11950Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011951If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952
11953static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011954unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011955{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011956 if (PyTuple_GET_SIZE(args) == 0)
11957 return do_strip(self, LEFTSTRIP); /* Common case */
11958 else
11959 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011960}
11961
11962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011963PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965\n\
11966Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011967If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011968
11969static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011970unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011971{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011972 if (PyTuple_GET_SIZE(args) == 0)
11973 return do_strip(self, RIGHTSTRIP); /* Common case */
11974 else
11975 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011976}
11977
11978
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011980unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011982 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
Georg Brandl222de0f2009-04-12 12:01:50 +000011985 if (len < 1) {
11986 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011987 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
Victor Stinnerc4b49542011-12-11 22:44:26 +010011990 /* no repeat, return original string */
11991 if (len == 1)
11992 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011993
Benjamin Petersonbac79492012-01-14 13:34:47 -050011994 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 return NULL;
11996
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011997 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011998 PyErr_SetString(PyExc_OverflowError,
11999 "repeated string is too long");
12000 return NULL;
12001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012003
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012004 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 if (!u)
12006 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012007 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (PyUnicode_GET_LENGTH(str) == 1) {
12010 const int kind = PyUnicode_KIND(str);
12011 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012012 if (kind == PyUnicode_1BYTE_KIND) {
12013 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012014 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012015 }
12016 else if (kind == PyUnicode_2BYTE_KIND) {
12017 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012018 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012019 ucs2[n] = fill_char;
12020 } else {
12021 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12022 assert(kind == PyUnicode_4BYTE_KIND);
12023 for (n = 0; n < len; ++n)
12024 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 }
12027 else {
12028 /* number of characters copied this far */
12029 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012030 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 char *to = (char *) PyUnicode_DATA(u);
12032 Py_MEMCPY(to, PyUnicode_DATA(str),
12033 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 n = (done <= nchars-done) ? done : nchars-done;
12036 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012037 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 }
12040
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012041 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012042 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
Alexander Belopolsky40018472011-02-26 01:02:56 +000012045PyObject *
12046PyUnicode_Replace(PyObject *obj,
12047 PyObject *subobj,
12048 PyObject *replobj,
12049 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050{
12051 PyObject *self;
12052 PyObject *str1;
12053 PyObject *str2;
12054 PyObject *result;
12055
12056 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012057 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012060 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 Py_DECREF(self);
12062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 }
12064 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012065 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 Py_DECREF(self);
12067 Py_DECREF(str1);
12068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012070 if (PyUnicode_READY(self) == -1 ||
12071 PyUnicode_READY(str1) == -1 ||
12072 PyUnicode_READY(str2) == -1)
12073 result = NULL;
12074 else
12075 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 Py_DECREF(self);
12077 Py_DECREF(str1);
12078 Py_DECREF(str2);
12079 return result;
12080}
12081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012082PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012083 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084\n\
12085Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012086old replaced by new. If the optional argument count is\n\
12087given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
12089static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 PyObject *str1;
12093 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012094 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 PyObject *result;
12096
Martin v. Löwis18e16552006-02-15 17:27:45 +000012097 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012099 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012102 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 return NULL;
12104 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012105 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 Py_DECREF(str1);
12107 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012108 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012109 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12110 result = NULL;
12111 else
12112 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113
12114 Py_DECREF(str1);
12115 Py_DECREF(str2);
12116 return result;
12117}
12118
Alexander Belopolsky40018472011-02-26 01:02:56 +000012119static PyObject *
12120unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012122 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 Py_ssize_t isize;
12124 Py_ssize_t osize, squote, dquote, i, o;
12125 Py_UCS4 max, quote;
12126 int ikind, okind;
12127 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012130 return NULL;
12131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 isize = PyUnicode_GET_LENGTH(unicode);
12133 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 /* Compute length of output, quote characters, and
12136 maximum character */
12137 osize = 2; /* quotes */
12138 max = 127;
12139 squote = dquote = 0;
12140 ikind = PyUnicode_KIND(unicode);
12141 for (i = 0; i < isize; i++) {
12142 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12143 switch (ch) {
12144 case '\'': squote++; osize++; break;
12145 case '"': dquote++; osize++; break;
12146 case '\\': case '\t': case '\r': case '\n':
12147 osize += 2; break;
12148 default:
12149 /* Fast-path ASCII */
12150 if (ch < ' ' || ch == 0x7f)
12151 osize += 4; /* \xHH */
12152 else if (ch < 0x7f)
12153 osize++;
12154 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12155 osize++;
12156 max = ch > max ? ch : max;
12157 }
12158 else if (ch < 0x100)
12159 osize += 4; /* \xHH */
12160 else if (ch < 0x10000)
12161 osize += 6; /* \uHHHH */
12162 else
12163 osize += 10; /* \uHHHHHHHH */
12164 }
12165 }
12166
12167 quote = '\'';
12168 if (squote) {
12169 if (dquote)
12170 /* Both squote and dquote present. Use squote,
12171 and escape them */
12172 osize += squote;
12173 else
12174 quote = '"';
12175 }
12176
12177 repr = PyUnicode_New(osize, max);
12178 if (repr == NULL)
12179 return NULL;
12180 okind = PyUnicode_KIND(repr);
12181 odata = PyUnicode_DATA(repr);
12182
12183 PyUnicode_WRITE(okind, odata, 0, quote);
12184 PyUnicode_WRITE(okind, odata, osize-1, quote);
12185
12186 for (i = 0, o = 1; i < isize; i++) {
12187 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012188
12189 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if ((ch == quote) || (ch == '\\')) {
12191 PyUnicode_WRITE(okind, odata, o++, '\\');
12192 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012193 continue;
12194 }
12195
Benjamin Peterson29060642009-01-31 22:14:21 +000012196 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012197 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 PyUnicode_WRITE(okind, odata, o++, '\\');
12199 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012200 }
12201 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 PyUnicode_WRITE(okind, odata, o++, '\\');
12203 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012204 }
12205 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 PyUnicode_WRITE(okind, odata, o++, '\\');
12207 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012208 }
12209
12210 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012211 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 PyUnicode_WRITE(okind, odata, o++, '\\');
12213 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012214 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12215 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012216 }
12217
Georg Brandl559e5d72008-06-11 18:37:52 +000012218 /* Copy ASCII characters as-is */
12219 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012221 }
12222
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012224 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012225 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012226 (categories Z* and C* except ASCII space)
12227 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012229 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (ch <= 0xff) {
12231 PyUnicode_WRITE(okind, odata, o++, '\\');
12232 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12234 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012235 }
12236 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 else if (ch >= 0x10000) {
12238 PyUnicode_WRITE(okind, odata, o++, '\\');
12239 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12242 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12243 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12244 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12245 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012248 }
12249 /* Map 16-bit characters to '\uxxxx' */
12250 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 PyUnicode_WRITE(okind, odata, o++, '\\');
12252 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012253 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012257 }
12258 }
12259 /* Copy characters as-is */
12260 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012262 }
12263 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012266 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012267 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268}
12269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012270PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272\n\
12273Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012274such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275arguments start and end are interpreted as in slice notation.\n\
12276\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012277Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
12279static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012282 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012283 Py_ssize_t start;
12284 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012285 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
Jesus Ceaac451502011-04-20 17:09:23 +020012287 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12288 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293 if (PyUnicode_READY(substring) == -1)
12294 return NULL;
12295
Victor Stinner7931d9a2011-11-04 00:22:48 +010012296 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
12298 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 if (result == -2)
12301 return NULL;
12302
Christian Heimes217cfd12007-12-02 14:31:20 +000012303 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304}
12305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012306PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012309Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310
12311static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012314 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012315 Py_ssize_t start;
12316 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012317 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318
Jesus Ceaac451502011-04-20 17:09:23 +020012319 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12320 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (PyUnicode_READY(self) == -1)
12324 return NULL;
12325 if (PyUnicode_READY(substring) == -1)
12326 return NULL;
12327
Victor Stinner7931d9a2011-11-04 00:22:48 +010012328 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
12330 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 if (result == -2)
12333 return NULL;
12334
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335 if (result < 0) {
12336 PyErr_SetString(PyExc_ValueError, "substring not found");
12337 return NULL;
12338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339
Christian Heimes217cfd12007-12-02 14:31:20 +000012340 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341}
12342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012343PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012346Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012347done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348
12349static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012350unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012352 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 Py_UCS4 fillchar = ' ';
12354
Victor Stinnere9a29352011-10-01 02:14:59 +020012355 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012357
Benjamin Petersonbac79492012-01-14 13:34:47 -050012358 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 return NULL;
12360
Victor Stinnerc4b49542011-12-11 22:44:26 +010012361 if (PyUnicode_GET_LENGTH(self) >= width)
12362 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
Victor Stinnerc4b49542011-12-11 22:44:26 +010012364 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
Alexander Belopolsky40018472011-02-26 01:02:56 +000012367PyObject *
12368PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369{
12370 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012371
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372 s = PyUnicode_FromObject(s);
12373 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012374 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 if (sep != NULL) {
12376 sep = PyUnicode_FromObject(sep);
12377 if (sep == NULL) {
12378 Py_DECREF(s);
12379 return NULL;
12380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 }
12382
Victor Stinner9310abb2011-10-05 00:59:23 +020012383 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384
12385 Py_DECREF(s);
12386 Py_XDECREF(sep);
12387 return result;
12388}
12389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012390PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012391 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392\n\
12393Return a list of the words in S, using sep as the\n\
12394delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012395splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012396whitespace string is a separator and empty strings are\n\
12397removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398
12399static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012400unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401{
12402 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012403 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404
Martin v. Löwis18e16552006-02-15 17:27:45 +000012405 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406 return NULL;
12407
12408 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012411 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012413 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414}
12415
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416PyObject *
12417PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12418{
12419 PyObject* str_obj;
12420 PyObject* sep_obj;
12421 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 int kind1, kind2, kind;
12423 void *buf1 = NULL, *buf2 = NULL;
12424 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012425
12426 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012427 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012430 if (!sep_obj) {
12431 Py_DECREF(str_obj);
12432 return NULL;
12433 }
12434 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12435 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436 Py_DECREF(str_obj);
12437 return NULL;
12438 }
12439
Victor Stinner14f8f022011-10-05 20:58:25 +020012440 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012442 kind = Py_MAX(kind1, kind2);
12443 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012445 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 if (!buf1)
12447 goto onError;
12448 buf2 = PyUnicode_DATA(sep_obj);
12449 if (kind2 != kind)
12450 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12451 if (!buf2)
12452 goto onError;
12453 len1 = PyUnicode_GET_LENGTH(str_obj);
12454 len2 = PyUnicode_GET_LENGTH(sep_obj);
12455
Benjamin Petersonead6b532011-12-20 17:23:42 -060012456 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012458 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12459 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12460 else
12461 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 break;
12463 case PyUnicode_2BYTE_KIND:
12464 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12465 break;
12466 case PyUnicode_4BYTE_KIND:
12467 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12468 break;
12469 default:
12470 assert(0);
12471 out = 0;
12472 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012473
12474 Py_DECREF(sep_obj);
12475 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 if (kind1 != kind)
12477 PyMem_Free(buf1);
12478 if (kind2 != kind)
12479 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480
12481 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 onError:
12483 Py_DECREF(sep_obj);
12484 Py_DECREF(str_obj);
12485 if (kind1 != kind && buf1)
12486 PyMem_Free(buf1);
12487 if (kind2 != kind && buf2)
12488 PyMem_Free(buf2);
12489 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012490}
12491
12492
12493PyObject *
12494PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12495{
12496 PyObject* str_obj;
12497 PyObject* sep_obj;
12498 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 int kind1, kind2, kind;
12500 void *buf1 = NULL, *buf2 = NULL;
12501 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012502
12503 str_obj = PyUnicode_FromObject(str_in);
12504 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506 sep_obj = PyUnicode_FromObject(sep_in);
12507 if (!sep_obj) {
12508 Py_DECREF(str_obj);
12509 return NULL;
12510 }
12511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 kind1 = PyUnicode_KIND(str_in);
12513 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012514 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 buf1 = PyUnicode_DATA(str_in);
12516 if (kind1 != kind)
12517 buf1 = _PyUnicode_AsKind(str_in, kind);
12518 if (!buf1)
12519 goto onError;
12520 buf2 = PyUnicode_DATA(sep_obj);
12521 if (kind2 != kind)
12522 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12523 if (!buf2)
12524 goto onError;
12525 len1 = PyUnicode_GET_LENGTH(str_obj);
12526 len2 = PyUnicode_GET_LENGTH(sep_obj);
12527
Benjamin Petersonead6b532011-12-20 17:23:42 -060012528 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012530 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12531 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12532 else
12533 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 break;
12535 case PyUnicode_2BYTE_KIND:
12536 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12537 break;
12538 case PyUnicode_4BYTE_KIND:
12539 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12540 break;
12541 default:
12542 assert(0);
12543 out = 0;
12544 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012545
12546 Py_DECREF(sep_obj);
12547 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (kind1 != kind)
12549 PyMem_Free(buf1);
12550 if (kind2 != kind)
12551 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012552
12553 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 onError:
12555 Py_DECREF(sep_obj);
12556 Py_DECREF(str_obj);
12557 if (kind1 != kind && buf1)
12558 PyMem_Free(buf1);
12559 if (kind2 != kind && buf2)
12560 PyMem_Free(buf2);
12561 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012562}
12563
12564PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012566\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012567Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012568the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012569found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012570
12571static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012572unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012573{
Victor Stinner9310abb2011-10-05 00:59:23 +020012574 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012575}
12576
12577PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012578 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012579\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012580Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012581the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012582separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012583
12584static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012585unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012586{
Victor Stinner9310abb2011-10-05 00:59:23 +020012587 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012588}
12589
Alexander Belopolsky40018472011-02-26 01:02:56 +000012590PyObject *
12591PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012592{
12593 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012594
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012595 s = PyUnicode_FromObject(s);
12596 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012598 if (sep != NULL) {
12599 sep = PyUnicode_FromObject(sep);
12600 if (sep == NULL) {
12601 Py_DECREF(s);
12602 return NULL;
12603 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012604 }
12605
Victor Stinner9310abb2011-10-05 00:59:23 +020012606 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012607
12608 Py_DECREF(s);
12609 Py_XDECREF(sep);
12610 return result;
12611}
12612
12613PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012615\n\
12616Return a list of the words in S, using sep as the\n\
12617delimiter string, starting at the end of the string and\n\
12618working to the front. If maxsplit is given, at most maxsplit\n\
12619splits are done. If sep is not specified, any whitespace string\n\
12620is a separator.");
12621
12622static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012623unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012624{
12625 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012626 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012627
Martin v. Löwis18e16552006-02-15 17:27:45 +000012628 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012629 return NULL;
12630
12631 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012633 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012634 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012635 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012636 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012637}
12638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012639PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641\n\
12642Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012643Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012644is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645
12646static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012647unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012649 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012650 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012652 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12653 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654 return NULL;
12655
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012656 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657}
12658
12659static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012660PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012662 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012665PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667\n\
12668Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012669and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670
12671static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012672unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012674 if (PyUnicode_READY(self) == -1)
12675 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012676 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677}
12678
Georg Brandlceee0772007-11-27 23:48:05 +000012679PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012681\n\
12682Return a translation table usable for str.translate().\n\
12683If there is only one argument, it must be a dictionary mapping Unicode\n\
12684ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012685Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012686If there are two arguments, they must be strings of equal length, and\n\
12687in the resulting dictionary, each character in x will be mapped to the\n\
12688character at the same position in y. If there is a third argument, it\n\
12689must be a string, whose characters will be mapped to None in the result.");
12690
12691static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012692unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012693{
12694 PyObject *x, *y = NULL, *z = NULL;
12695 PyObject *new = NULL, *key, *value;
12696 Py_ssize_t i = 0;
12697 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012698
Georg Brandlceee0772007-11-27 23:48:05 +000012699 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12700 return NULL;
12701 new = PyDict_New();
12702 if (!new)
12703 return NULL;
12704 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 int x_kind, y_kind, z_kind;
12706 void *x_data, *y_data, *z_data;
12707
Georg Brandlceee0772007-11-27 23:48:05 +000012708 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012709 if (!PyUnicode_Check(x)) {
12710 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12711 "be a string if there is a second argument");
12712 goto err;
12713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012715 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12716 "arguments must have equal length");
12717 goto err;
12718 }
12719 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 x_kind = PyUnicode_KIND(x);
12721 y_kind = PyUnicode_KIND(y);
12722 x_data = PyUnicode_DATA(x);
12723 y_data = PyUnicode_DATA(y);
12724 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12725 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012726 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012727 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012728 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012729 if (!value) {
12730 Py_DECREF(key);
12731 goto err;
12732 }
Georg Brandlceee0772007-11-27 23:48:05 +000012733 res = PyDict_SetItem(new, key, value);
12734 Py_DECREF(key);
12735 Py_DECREF(value);
12736 if (res < 0)
12737 goto err;
12738 }
12739 /* create entries for deleting chars in z */
12740 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 z_kind = PyUnicode_KIND(z);
12742 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012743 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012745 if (!key)
12746 goto err;
12747 res = PyDict_SetItem(new, key, Py_None);
12748 Py_DECREF(key);
12749 if (res < 0)
12750 goto err;
12751 }
12752 }
12753 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 int kind;
12755 void *data;
12756
Georg Brandlceee0772007-11-27 23:48:05 +000012757 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012758 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012759 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12760 "to maketrans it must be a dict");
12761 goto err;
12762 }
12763 /* copy entries into the new dict, converting string keys to int keys */
12764 while (PyDict_Next(x, &i, &key, &value)) {
12765 if (PyUnicode_Check(key)) {
12766 /* convert string keys to integer keys */
12767 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012768 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012769 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12770 "table must be of length 1");
12771 goto err;
12772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 kind = PyUnicode_KIND(key);
12774 data = PyUnicode_DATA(key);
12775 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012776 if (!newkey)
12777 goto err;
12778 res = PyDict_SetItem(new, newkey, value);
12779 Py_DECREF(newkey);
12780 if (res < 0)
12781 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012782 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012783 /* just keep integer keys */
12784 if (PyDict_SetItem(new, key, value) < 0)
12785 goto err;
12786 } else {
12787 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12788 "be strings or integers");
12789 goto err;
12790 }
12791 }
12792 }
12793 return new;
12794 err:
12795 Py_DECREF(new);
12796 return NULL;
12797}
12798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012799PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801\n\
12802Return a copy of the string S, where all characters have been mapped\n\
12803through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012804Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012805Unmapped characters are left untouched. Characters mapped to None\n\
12806are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
12808static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812}
12813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012814PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012817Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
12819static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012820unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012822 if (PyUnicode_READY(self) == -1)
12823 return NULL;
12824 if (PyUnicode_IS_ASCII(self))
12825 return ascii_upper_or_lower(self, 0);
12826 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827}
12828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012829PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012832Pad a numeric string S with zeros on the left, to fill a field\n\
12833of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
12835static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012836unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012838 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012839 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012840 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 int kind;
12842 void *data;
12843 Py_UCS4 chr;
12844
Martin v. Löwis18e16552006-02-15 17:27:45 +000012845 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 return NULL;
12847
Benjamin Petersonbac79492012-01-14 13:34:47 -050012848 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850
Victor Stinnerc4b49542011-12-11 22:44:26 +010012851 if (PyUnicode_GET_LENGTH(self) >= width)
12852 return unicode_result_unchanged(self);
12853
12854 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855
12856 u = pad(self, fill, 0, '0');
12857
Walter Dörwald068325e2002-04-15 13:36:47 +000012858 if (u == NULL)
12859 return NULL;
12860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 kind = PyUnicode_KIND(u);
12862 data = PyUnicode_DATA(u);
12863 chr = PyUnicode_READ(kind, data, fill);
12864
12865 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 PyUnicode_WRITE(kind, data, 0, chr);
12868 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869 }
12870
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012871 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012872 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(); its method-table entry is
   disabled by a matching "#if 0". */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012883PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012884 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012886Return True if S starts with the specified prefix, False otherwise.\n\
12887With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012888With optional end, stop comparing S at that position.\n\
12889prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890
12891static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012892unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012895 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012896 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012897 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012898 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012899 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
Jesus Ceaac451502011-04-20 17:09:23 +020012901 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012902 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012903 if (PyTuple_Check(subobj)) {
12904 Py_ssize_t i;
12905 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012906 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012907 if (substring == NULL)
12908 return NULL;
12909 result = tailmatch(self, substring, start, end, -1);
12910 Py_DECREF(substring);
12911 if (result) {
12912 Py_RETURN_TRUE;
12913 }
12914 }
12915 /* nothing matched */
12916 Py_RETURN_FALSE;
12917 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012918 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012919 if (substring == NULL) {
12920 if (PyErr_ExceptionMatches(PyExc_TypeError))
12921 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12922 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012924 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012925 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012927 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928}
12929
12930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012931PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012934Return True if S ends with the specified suffix, False otherwise.\n\
12935With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012936With optional end, stop comparing S at that position.\n\
12937suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938
12939static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012940unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012941 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012943 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012944 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012945 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012946 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012947 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
Jesus Ceaac451502011-04-20 17:09:23 +020012949 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012951 if (PyTuple_Check(subobj)) {
12952 Py_ssize_t i;
12953 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012954 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012956 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012958 result = tailmatch(self, substring, start, end, +1);
12959 Py_DECREF(substring);
12960 if (result) {
12961 Py_RETURN_TRUE;
12962 }
12963 }
12964 Py_RETURN_FALSE;
12965 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012966 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012967 if (substring == NULL) {
12968 if (PyErr_ExceptionMatches(PyExc_TypeError))
12969 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12970 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012972 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012973 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012975 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976}
12977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012979
12980PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012982\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012983Return a formatted version of S, using substitutions from args and kwargs.\n\
12984The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012985
Eric Smith27bbca62010-11-04 17:06:58 +000012986PyDoc_STRVAR(format_map__doc__,
12987 "S.format_map(mapping) -> str\n\
12988\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012989Return a formatted version of S, using substitutions from mapping.\n\
12990The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012991
Eric Smith4a7d76d2008-05-30 18:10:19 +000012992static PyObject *
12993unicode__format__(PyObject* self, PyObject* args)
12994{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012995 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012996
12997 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12998 return NULL;
12999
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013000 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013002 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013003}
13004
Eric Smith8c663262007-08-25 02:26:07 +000013005PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013007\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013008Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013009
13010static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013011unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 Py_ssize_t size;
13014
13015 /* If it's a compact object, account for base structure +
13016 character data. */
13017 if (PyUnicode_IS_COMPACT_ASCII(v))
13018 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13019 else if (PyUnicode_IS_COMPACT(v))
13020 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013021 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 else {
13023 /* If it is a two-block object, account for base object, and
13024 for character block if present. */
13025 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013026 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013028 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 }
13030 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013031 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013032 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013034 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013035 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036
13037 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013038}
13039
13040PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013042
13043static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013044unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013045{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013046 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 if (!copy)
13048 return NULL;
13049 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013050}
13051
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052static PyMethodDef unicode_methods[] = {
13053
13054 /* Order is according to common usage: often used methods should
13055 appear first, since lookup is done sequentially. */
13056
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013057 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013058 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13059 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013060 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013061 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13062 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013063 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013064 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13065 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13066 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13067 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13068 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013069 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013070 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13071 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13072 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013073 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013074 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13075 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13076 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013077 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013078 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013079 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013080 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013081 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13082 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13083 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13084 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13085 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13086 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13087 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13088 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13089 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13090 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13091 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13092 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13093 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13094 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013095 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013096 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013097 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013098 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013099 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013100 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013101 {"maketrans", (PyCFunction) unicode_maketrans,
13102 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013103 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013104#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013105 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013106 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107#endif
13108
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 {NULL, NULL}
13111};
13112
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013113static PyObject *
13114unicode_mod(PyObject *v, PyObject *w)
13115{
Brian Curtindfc80e32011-08-10 20:28:54 -050013116 if (!PyUnicode_Check(v))
13117 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013119}
13120
13121static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 0, /*nb_add*/
13123 0, /*nb_subtract*/
13124 0, /*nb_multiply*/
13125 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013126};
13127
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013129 (lenfunc) unicode_length, /* sq_length */
13130 PyUnicode_Concat, /* sq_concat */
13131 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13132 (ssizeargfunc) unicode_getitem, /* sq_item */
13133 0, /* sq_slice */
13134 0, /* sq_ass_item */
13135 0, /* sq_ass_slice */
13136 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137};
13138
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013139static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013140unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 if (PyUnicode_READY(self) == -1)
13143 return NULL;
13144
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013145 if (PyIndex_Check(item)) {
13146 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013147 if (i == -1 && PyErr_Occurred())
13148 return NULL;
13149 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013151 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013152 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013153 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013154 PyObject *result;
13155 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013156 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013157 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013161 return NULL;
13162 }
13163
13164 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013165 Py_INCREF(unicode_empty);
13166 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013168 slicelength == PyUnicode_GET_LENGTH(self)) {
13169 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013170 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013171 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013172 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013173 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013174 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013175 src_kind = PyUnicode_KIND(self);
13176 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013177 if (!PyUnicode_IS_ASCII(self)) {
13178 kind_limit = kind_maxchar_limit(src_kind);
13179 max_char = 0;
13180 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13181 ch = PyUnicode_READ(src_kind, src_data, cur);
13182 if (ch > max_char) {
13183 max_char = ch;
13184 if (max_char >= kind_limit)
13185 break;
13186 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013187 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013188 }
Victor Stinner55c99112011-10-13 01:17:06 +020013189 else
13190 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013191 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013192 if (result == NULL)
13193 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013194 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013195 dest_data = PyUnicode_DATA(result);
13196
13197 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013198 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13199 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013200 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013201 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013202 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013203 } else {
13204 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13205 return NULL;
13206 }
13207}
13208
13209static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 (lenfunc)unicode_length, /* mp_length */
13211 (binaryfunc)unicode_subscript, /* mp_subscript */
13212 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013213};
13214
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216/* Helpers for PyUnicode_Format() */
13217
13218static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013219getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013221 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013223 (*p_argidx)++;
13224 if (arglen < 0)
13225 return args;
13226 else
13227 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228 }
13229 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231 return NULL;
13232}
13233
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013234/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013236static PyObject *
13237formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013239 char *p;
13240 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013242
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243 x = PyFloat_AsDouble(v);
13244 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013245 return NULL;
13246
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013249
Eric Smith0923d1d2009-04-16 20:16:10 +000013250 p = PyOS_double_to_string(x, type, prec,
13251 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013252 if (p == NULL)
13253 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013255 PyMem_Free(p);
13256 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257}
13258
Tim Peters38fd5b62000-09-21 05:43:11 +000013259static PyObject*
13260formatlong(PyObject *val, int flags, int prec, int type)
13261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013262 char *buf;
13263 int len;
13264 PyObject *str; /* temporary string object. */
13265 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013266
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13268 if (!str)
13269 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 Py_DECREF(str);
13272 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013273}
13274
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013275static Py_UCS4
13276formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013278 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013279 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013281 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 goto onError;
13284 }
13285 else {
13286 /* Integer input truncated to a character */
13287 long x;
13288 x = PyLong_AsLong(v);
13289 if (x == -1 && PyErr_Occurred())
13290 goto onError;
13291
Victor Stinner8faf8212011-12-08 22:14:11 +010013292 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 PyErr_SetString(PyExc_OverflowError,
13294 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013295 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 }
13297
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013298 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013299 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013300
Benjamin Peterson29060642009-01-31 22:14:21 +000013301 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013302 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013304 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305}
13306
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013307static int
13308repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13309{
13310 int r;
13311 assert(count > 0);
13312 assert(PyUnicode_Check(obj));
13313 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013314 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013315 if (repeated == NULL)
13316 return -1;
13317 r = _PyAccu_Accumulate(acc, repeated);
13318 Py_DECREF(repeated);
13319 return r;
13320 }
13321 else {
13322 do {
13323 if (_PyAccu_Accumulate(acc, obj))
13324 return -1;
13325 } while (--count);
13326 return 0;
13327 }
13328}
13329
Alexander Belopolsky40018472011-02-26 01:02:56 +000013330PyObject *
13331PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 void *fmt;
13334 int fmtkind;
13335 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013337 int r;
13338 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013341 PyObject *temp = NULL;
13342 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013343 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013344 _PyAccu acc;
13345 static PyObject *plus, *minus, *blank, *zero, *percent;
13346
13347 if (!plus && !(plus = get_latin1_char('+')))
13348 return NULL;
13349 if (!minus && !(minus = get_latin1_char('-')))
13350 return NULL;
13351 if (!blank && !(blank = get_latin1_char(' ')))
13352 return NULL;
13353 if (!zero && !(zero = get_latin1_char('0')))
13354 return NULL;
13355 if (!percent && !(percent = get_latin1_char('%')))
13356 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013357
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 PyErr_BadInternalCall();
13360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013362 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013363 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013365 if (PyUnicode_READY(uformat) == -1)
13366 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013367 if (_PyAccu_Init(&acc))
13368 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013369 fmt = PyUnicode_DATA(uformat);
13370 fmtkind = PyUnicode_KIND(uformat);
13371 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13372 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 arglen = PyTuple_Size(args);
13376 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377 }
13378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013379 arglen = -1;
13380 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013382 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013383 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013385
13386 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013388 PyObject *nonfmt;
13389 Py_ssize_t nonfmtpos;
13390 nonfmtpos = fmtpos++;
13391 while (fmtcnt >= 0 &&
13392 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13393 fmtpos++;
13394 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013395 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013396 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 if (nonfmt == NULL)
13398 goto onError;
13399 r = _PyAccu_Accumulate(&acc, nonfmt);
13400 Py_DECREF(nonfmt);
13401 if (r)
13402 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013403 }
13404 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 /* Got a format specifier */
13406 int flags = 0;
13407 Py_ssize_t width = -1;
13408 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 int isnumok;
13412 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 void *pbuf = NULL;
13414 Py_ssize_t pindex, len;
13415 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013417 fmtpos++;
13418 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13419 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 Py_ssize_t keylen;
13421 PyObject *key;
13422 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013423
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 if (dict == NULL) {
13425 PyErr_SetString(PyExc_TypeError,
13426 "format requires a mapping");
13427 goto onError;
13428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013429 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013431 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 /* Skip over balanced parentheses */
13433 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013434 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013438 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 if (fmtcnt < 0 || pcount > 0) {
13442 PyErr_SetString(PyExc_ValueError,
13443 "incomplete format key");
13444 goto onError;
13445 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013446 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013447 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 if (key == NULL)
13449 goto onError;
13450 if (args_owned) {
13451 Py_DECREF(args);
13452 args_owned = 0;
13453 }
13454 args = PyObject_GetItem(dict, key);
13455 Py_DECREF(key);
13456 if (args == NULL) {
13457 goto onError;
13458 }
13459 args_owned = 1;
13460 arglen = -1;
13461 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013462 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 case '-': flags |= F_LJUST; continue;
13466 case '+': flags |= F_SIGN; continue;
13467 case ' ': flags |= F_BLANK; continue;
13468 case '#': flags |= F_ALT; continue;
13469 case '0': flags |= F_ZERO; continue;
13470 }
13471 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 if (c == '*') {
13474 v = getnextarg(args, arglen, &argidx);
13475 if (v == NULL)
13476 goto onError;
13477 if (!PyLong_Check(v)) {
13478 PyErr_SetString(PyExc_TypeError,
13479 "* wants int");
13480 goto onError;
13481 }
13482 width = PyLong_AsLong(v);
13483 if (width == -1 && PyErr_Occurred())
13484 goto onError;
13485 if (width < 0) {
13486 flags |= F_LJUST;
13487 width = -width;
13488 }
13489 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013491 }
13492 else if (c >= '0' && c <= '9') {
13493 width = c - '0';
13494 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 if (c < '0' || c > '9')
13497 break;
13498 if ((width*10) / 10 != width) {
13499 PyErr_SetString(PyExc_ValueError,
13500 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013501 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 }
13503 width = width*10 + (c - '0');
13504 }
13505 }
13506 if (c == '.') {
13507 prec = 0;
13508 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 if (c == '*') {
13511 v = getnextarg(args, arglen, &argidx);
13512 if (v == NULL)
13513 goto onError;
13514 if (!PyLong_Check(v)) {
13515 PyErr_SetString(PyExc_TypeError,
13516 "* wants int");
13517 goto onError;
13518 }
13519 prec = PyLong_AsLong(v);
13520 if (prec == -1 && PyErr_Occurred())
13521 goto onError;
13522 if (prec < 0)
13523 prec = 0;
13524 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013525 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 }
13527 else if (c >= '0' && c <= '9') {
13528 prec = c - '0';
13529 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013530 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 if (c < '0' || c > '9')
13532 break;
13533 if ((prec*10) / 10 != prec) {
13534 PyErr_SetString(PyExc_ValueError,
13535 "prec too big");
13536 goto onError;
13537 }
13538 prec = prec*10 + (c - '0');
13539 }
13540 }
13541 } /* prec */
13542 if (fmtcnt >= 0) {
13543 if (c == 'h' || c == 'l' || c == 'L') {
13544 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 }
13547 }
13548 if (fmtcnt < 0) {
13549 PyErr_SetString(PyExc_ValueError,
13550 "incomplete format");
13551 goto onError;
13552 }
13553 if (c != '%') {
13554 v = getnextarg(args, arglen, &argidx);
13555 if (v == NULL)
13556 goto onError;
13557 }
13558 sign = 0;
13559 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013560 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013561 switch (c) {
13562
13563 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 _PyAccu_Accumulate(&acc, percent);
13565 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013566
13567 case 's':
13568 case 'r':
13569 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013570 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 temp = v;
13572 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013573 }
13574 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 if (c == 's')
13576 temp = PyObject_Str(v);
13577 else if (c == 'r')
13578 temp = PyObject_Repr(v);
13579 else
13580 temp = PyObject_ASCII(v);
13581 if (temp == NULL)
13582 goto onError;
13583 if (PyUnicode_Check(temp))
13584 /* nothing to do */;
13585 else {
13586 Py_DECREF(temp);
13587 PyErr_SetString(PyExc_TypeError,
13588 "%s argument has non-string str()");
13589 goto onError;
13590 }
13591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013592 if (PyUnicode_READY(temp) == -1) {
13593 Py_CLEAR(temp);
13594 goto onError;
13595 }
13596 pbuf = PyUnicode_DATA(temp);
13597 kind = PyUnicode_KIND(temp);
13598 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 if (prec >= 0 && len > prec)
13600 len = prec;
13601 break;
13602
13603 case 'i':
13604 case 'd':
13605 case 'u':
13606 case 'o':
13607 case 'x':
13608 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 isnumok = 0;
13610 if (PyNumber_Check(v)) {
13611 PyObject *iobj=NULL;
13612
13613 if (PyLong_Check(v)) {
13614 iobj = v;
13615 Py_INCREF(iobj);
13616 }
13617 else {
13618 iobj = PyNumber_Long(v);
13619 }
13620 if (iobj!=NULL) {
13621 if (PyLong_Check(iobj)) {
13622 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013623 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 Py_DECREF(iobj);
13625 if (!temp)
13626 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013627 if (PyUnicode_READY(temp) == -1) {
13628 Py_CLEAR(temp);
13629 goto onError;
13630 }
13631 pbuf = PyUnicode_DATA(temp);
13632 kind = PyUnicode_KIND(temp);
13633 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 sign = 1;
13635 }
13636 else {
13637 Py_DECREF(iobj);
13638 }
13639 }
13640 }
13641 if (!isnumok) {
13642 PyErr_Format(PyExc_TypeError,
13643 "%%%c format: a number is required, "
13644 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13645 goto onError;
13646 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013648 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013649 fillobj = zero;
13650 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013651 break;
13652
13653 case 'e':
13654 case 'E':
13655 case 'f':
13656 case 'F':
13657 case 'g':
13658 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013659 temp = formatfloat(v, flags, prec, c);
13660 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013661 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 if (PyUnicode_READY(temp) == -1) {
13663 Py_CLEAR(temp);
13664 goto onError;
13665 }
13666 pbuf = PyUnicode_DATA(temp);
13667 kind = PyUnicode_KIND(temp);
13668 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013670 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013672 fillobj = zero;
13673 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 break;
13675
13676 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013677 {
13678 Py_UCS4 ch = formatchar(v);
13679 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013681 temp = _PyUnicode_FromUCS4(&ch, 1);
13682 if (temp == NULL)
13683 goto onError;
13684 pbuf = PyUnicode_DATA(temp);
13685 kind = PyUnicode_KIND(temp);
13686 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013688 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013689
13690 default:
13691 PyErr_Format(PyExc_ValueError,
13692 "unsupported format character '%c' (0x%x) "
13693 "at index %zd",
13694 (31<=c && c<=126) ? (char)c : '?',
13695 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013696 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 goto onError;
13698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 /* pbuf is initialized here. */
13700 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013702 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13703 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013704 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013705 pindex++;
13706 }
13707 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13708 signobj = plus;
13709 len--;
13710 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 }
13712 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013713 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013715 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 else
13717 sign = 0;
13718 }
13719 if (width < len)
13720 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013722 if (fill != ' ') {
13723 assert(signobj != NULL);
13724 if (_PyAccu_Accumulate(&acc, signobj))
13725 goto onError;
13726 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 if (width > len)
13728 width--;
13729 }
13730 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013732 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013734 second = get_latin1_char(
13735 PyUnicode_READ(kind, pbuf, pindex + 1));
13736 pindex += 2;
13737 if (second == NULL ||
13738 _PyAccu_Accumulate(&acc, zero) ||
13739 _PyAccu_Accumulate(&acc, second))
13740 goto onError;
13741 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 width -= 2;
13744 if (width < 0)
13745 width = 0;
13746 len -= 2;
13747 }
13748 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013749 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013750 if (repeat_accumulate(&acc, fillobj, width - len))
13751 goto onError;
13752 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 }
13754 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013755 if (sign) {
13756 assert(signobj != NULL);
13757 if (_PyAccu_Accumulate(&acc, signobj))
13758 goto onError;
13759 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013760 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013761 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13762 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013763 second = get_latin1_char(
13764 PyUnicode_READ(kind, pbuf, pindex + 1));
13765 pindex += 2;
13766 if (second == NULL ||
13767 _PyAccu_Accumulate(&acc, zero) ||
13768 _PyAccu_Accumulate(&acc, second))
13769 goto onError;
13770 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013771 }
13772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013774 if (temp != NULL) {
13775 assert(pbuf == PyUnicode_DATA(temp));
13776 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013777 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013778 else {
13779 const char *p = (const char *) pbuf;
13780 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013781 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013782 v = PyUnicode_FromKindAndData(kind, p, len);
13783 }
13784 if (v == NULL)
13785 goto onError;
13786 r = _PyAccu_Accumulate(&acc, v);
13787 Py_DECREF(v);
13788 if (r)
13789 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013790 if (width > len && repeat_accumulate(&acc, blank, width - len))
13791 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 if (dict && (argidx < arglen) && c != '%') {
13793 PyErr_SetString(PyExc_TypeError,
13794 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 goto onError;
13796 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013797 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013798 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013799 } /* until end */
13800 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 PyErr_SetString(PyExc_TypeError,
13802 "not all arguments converted during string formatting");
13803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804 }
13805
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013806 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809 }
13810 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013811 Py_XDECREF(temp);
13812 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013813 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814
Benjamin Peterson29060642009-01-31 22:14:21 +000013815 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013817 Py_XDECREF(temp);
13818 Py_XDECREF(second);
13819 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822 }
13823 return NULL;
13824}
13825
Jeremy Hylton938ace62002-07-17 16:30:39 +000013826static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013827unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13828
Tim Peters6d6c1a32001-08-02 04:15:00 +000013829static PyObject *
13830unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13831{
Benjamin Peterson29060642009-01-31 22:14:21 +000013832 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013833 static char *kwlist[] = {"object", "encoding", "errors", 0};
13834 char *encoding = NULL;
13835 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013836
Benjamin Peterson14339b62009-01-31 16:36:08 +000013837 if (type != &PyUnicode_Type)
13838 return unicode_subtype_new(type, args, kwds);
13839 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013842 if (x == NULL) {
13843 Py_INCREF(unicode_empty);
13844 return unicode_empty;
13845 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 if (encoding == NULL && errors == NULL)
13847 return PyObject_Str(x);
13848 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013849 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013850}
13851
Guido van Rossume023fe02001-08-30 03:12:59 +000013852static PyObject *
13853unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13854{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013855 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013856 Py_ssize_t length, char_size;
13857 int share_wstr, share_utf8;
13858 unsigned int kind;
13859 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013860
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013862
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013863 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013864 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013865 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013866 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013867 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013868 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013869 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013870 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013871
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013872 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013873 if (self == NULL) {
13874 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013875 return NULL;
13876 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013877 kind = PyUnicode_KIND(unicode);
13878 length = PyUnicode_GET_LENGTH(unicode);
13879
13880 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013881#ifdef Py_DEBUG
13882 _PyUnicode_HASH(self) = -1;
13883#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013884 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013885#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013886 _PyUnicode_STATE(self).interned = 0;
13887 _PyUnicode_STATE(self).kind = kind;
13888 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013889 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013890 _PyUnicode_STATE(self).ready = 1;
13891 _PyUnicode_WSTR(self) = NULL;
13892 _PyUnicode_UTF8_LENGTH(self) = 0;
13893 _PyUnicode_UTF8(self) = NULL;
13894 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013895 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013896
13897 share_utf8 = 0;
13898 share_wstr = 0;
13899 if (kind == PyUnicode_1BYTE_KIND) {
13900 char_size = 1;
13901 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13902 share_utf8 = 1;
13903 }
13904 else if (kind == PyUnicode_2BYTE_KIND) {
13905 char_size = 2;
13906 if (sizeof(wchar_t) == 2)
13907 share_wstr = 1;
13908 }
13909 else {
13910 assert(kind == PyUnicode_4BYTE_KIND);
13911 char_size = 4;
13912 if (sizeof(wchar_t) == 4)
13913 share_wstr = 1;
13914 }
13915
13916 /* Ensure we won't overflow the length. */
13917 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13918 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013919 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013920 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013921 data = PyObject_MALLOC((length + 1) * char_size);
13922 if (data == NULL) {
13923 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013924 goto onError;
13925 }
13926
Victor Stinnerc3c74152011-10-02 20:39:55 +020013927 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013928 if (share_utf8) {
13929 _PyUnicode_UTF8_LENGTH(self) = length;
13930 _PyUnicode_UTF8(self) = data;
13931 }
13932 if (share_wstr) {
13933 _PyUnicode_WSTR_LENGTH(self) = length;
13934 _PyUnicode_WSTR(self) = (wchar_t *)data;
13935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013936
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013937 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013938 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013939 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013940#ifdef Py_DEBUG
13941 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13942#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013943 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013944 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013945
13946onError:
13947 Py_DECREF(unicode);
13948 Py_DECREF(self);
13949 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013950}
13951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013952PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013953 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013954\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013955Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013956encoding defaults to the current default string encoding.\n\
13957errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013958
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013959static PyObject *unicode_iter(PyObject *seq);
13960
Guido van Rossumd57fd912000-03-10 22:53:23 +000013961PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013962 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013963 "str", /* tp_name */
13964 sizeof(PyUnicodeObject), /* tp_size */
13965 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013966 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013967 (destructor)unicode_dealloc, /* tp_dealloc */
13968 0, /* tp_print */
13969 0, /* tp_getattr */
13970 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013971 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 unicode_repr, /* tp_repr */
13973 &unicode_as_number, /* tp_as_number */
13974 &unicode_as_sequence, /* tp_as_sequence */
13975 &unicode_as_mapping, /* tp_as_mapping */
13976 (hashfunc) unicode_hash, /* tp_hash*/
13977 0, /* tp_call*/
13978 (reprfunc) unicode_str, /* tp_str */
13979 PyObject_GenericGetAttr, /* tp_getattro */
13980 0, /* tp_setattro */
13981 0, /* tp_as_buffer */
13982 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013983 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013984 unicode_doc, /* tp_doc */
13985 0, /* tp_traverse */
13986 0, /* tp_clear */
13987 PyUnicode_RichCompare, /* tp_richcompare */
13988 0, /* tp_weaklistoffset */
13989 unicode_iter, /* tp_iter */
13990 0, /* tp_iternext */
13991 unicode_methods, /* tp_methods */
13992 0, /* tp_members */
13993 0, /* tp_getset */
13994 &PyBaseObject_Type, /* tp_base */
13995 0, /* tp_dict */
13996 0, /* tp_descr_get */
13997 0, /* tp_descr_set */
13998 0, /* tp_dictoffset */
13999 0, /* tp_init */
14000 0, /* tp_alloc */
14001 unicode_new, /* tp_new */
14002 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014003};
14004
14005/* Initialize the Unicode implementation */
14006
Victor Stinner3a50e702011-10-18 21:21:00 +020014007int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014008{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014009 int i;
14010
Thomas Wouters477c8d52006-05-27 19:21:47 +000014011 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014012 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014013 0x000A, /* LINE FEED */
14014 0x000D, /* CARRIAGE RETURN */
14015 0x001C, /* FILE SEPARATOR */
14016 0x001D, /* GROUP SEPARATOR */
14017 0x001E, /* RECORD SEPARATOR */
14018 0x0085, /* NEXT LINE */
14019 0x2028, /* LINE SEPARATOR */
14020 0x2029, /* PARAGRAPH SEPARATOR */
14021 };
14022
Fred Drakee4315f52000-05-09 19:53:39 +000014023 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014024 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014025 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014026 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014027 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014028
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014029 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014030 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014031 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014033
14034 /* initialize the linebreak bloom filter */
14035 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014036 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014037 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014038
14039 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014040
14041#ifdef HAVE_MBCS
14042 winver.dwOSVersionInfoSize = sizeof(winver);
14043 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14044 PyErr_SetFromWindowsErr(0);
14045 return -1;
14046 }
14047#endif
14048 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014049}
14050
14051/* Finalize the Unicode implementation */
14052
/* This function performs no work here: it unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14058
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059void
Thomas Wouters78890102000-07-22 19:25:51 +000014060_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014061{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014062 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014064 Py_XDECREF(unicode_empty);
14065 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014066
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014067 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014068 if (unicode_latin1[i]) {
14069 Py_DECREF(unicode_latin1[i]);
14070 unicode_latin1[i] = NULL;
14071 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014072 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014073 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014074 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014075}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014076
Walter Dörwald16807132007-05-25 13:52:07 +000014077void
14078PyUnicode_InternInPlace(PyObject **p)
14079{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014080 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014082#ifdef Py_DEBUG
14083 assert(s != NULL);
14084 assert(_PyUnicode_CHECK(s));
14085#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014087 return;
14088#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014089 /* If it's a subclass, we don't really know what putting
14090 it in the interned dict might do. */
14091 if (!PyUnicode_CheckExact(s))
14092 return;
14093 if (PyUnicode_CHECK_INTERNED(s))
14094 return;
14095 if (interned == NULL) {
14096 interned = PyDict_New();
14097 if (interned == NULL) {
14098 PyErr_Clear(); /* Don't leave an exception */
14099 return;
14100 }
14101 }
14102 /* It might be that the GetItem call fails even
14103 though the key is present in the dictionary,
14104 namely when this happens during a stack overflow. */
14105 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014106 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014107 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014108
Benjamin Peterson29060642009-01-31 22:14:21 +000014109 if (t) {
14110 Py_INCREF(t);
14111 Py_DECREF(*p);
14112 *p = t;
14113 return;
14114 }
Walter Dörwald16807132007-05-25 13:52:07 +000014115
Benjamin Peterson14339b62009-01-31 16:36:08 +000014116 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014117 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014118 PyErr_Clear();
14119 PyThreadState_GET()->recursion_critical = 0;
14120 return;
14121 }
14122 PyThreadState_GET()->recursion_critical = 0;
14123 /* The two references in interned are not counted by refcnt.
14124 The deallocator will take care of this */
14125 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014126 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014127}
14128
14129void
14130PyUnicode_InternImmortal(PyObject **p)
14131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014132 PyUnicode_InternInPlace(p);
14133 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014134 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014135 Py_INCREF(*p);
14136 }
Walter Dörwald16807132007-05-25 13:52:07 +000014137}
14138
14139PyObject *
14140PyUnicode_InternFromString(const char *cp)
14141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 PyObject *s = PyUnicode_FromString(cp);
14143 if (s == NULL)
14144 return NULL;
14145 PyUnicode_InternInPlace(&s);
14146 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014147}
14148
Alexander Belopolsky40018472011-02-26 01:02:56 +000014149void
14150_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014153 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 Py_ssize_t i, n;
14155 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014156
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 if (interned == NULL || !PyDict_Check(interned))
14158 return;
14159 keys = PyDict_Keys(interned);
14160 if (keys == NULL || !PyList_Check(keys)) {
14161 PyErr_Clear();
14162 return;
14163 }
Walter Dörwald16807132007-05-25 13:52:07 +000014164
Benjamin Peterson14339b62009-01-31 16:36:08 +000014165 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14166 detector, interned unicode strings are not forcibly deallocated;
14167 rather, we give them their stolen references back, and then clear
14168 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014169
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 n = PyList_GET_SIZE(keys);
14171 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014172 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014174 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014175 if (PyUnicode_READY(s) == -1) {
14176 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014177 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014179 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 case SSTATE_NOT_INTERNED:
14181 /* XXX Shouldn't happen */
14182 break;
14183 case SSTATE_INTERNED_IMMORTAL:
14184 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014185 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 break;
14187 case SSTATE_INTERNED_MORTAL:
14188 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014189 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014190 break;
14191 default:
14192 Py_FatalError("Inconsistent interned string state.");
14193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014194 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014195 }
14196 fprintf(stderr, "total size of all interned strings: "
14197 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14198 "mortal/immortal\n", mortal_size, immortal_size);
14199 Py_DECREF(keys);
14200 PyDict_Clear(interned);
14201 Py_DECREF(interned);
14202 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014203}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014204
14205
14206/********************* Unicode Iterator **************************/
14207
14208typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014209 PyObject_HEAD
14210 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014211 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014212} unicodeiterobject;
14213
14214static void
14215unicodeiter_dealloc(unicodeiterobject *it)
14216{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 _PyObject_GC_UNTRACK(it);
14218 Py_XDECREF(it->it_seq);
14219 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014220}
14221
14222static int
14223unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14224{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 Py_VISIT(it->it_seq);
14226 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014227}
14228
14229static PyObject *
14230unicodeiter_next(unicodeiterobject *it)
14231{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014232 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014233
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 assert(it != NULL);
14235 seq = it->it_seq;
14236 if (seq == NULL)
14237 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014238 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014240 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14241 int kind = PyUnicode_KIND(seq);
14242 void *data = PyUnicode_DATA(seq);
14243 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14244 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 if (item != NULL)
14246 ++it->it_index;
14247 return item;
14248 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014249
Benjamin Peterson14339b62009-01-31 16:36:08 +000014250 Py_DECREF(seq);
14251 it->it_seq = NULL;
14252 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014253}
14254
14255static PyObject *
14256unicodeiter_len(unicodeiterobject *it)
14257{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014258 Py_ssize_t len = 0;
14259 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014260 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014261 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014262}
14263
14264PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14265
14266static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014268 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014270};
14271
14272PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14274 "str_iterator", /* tp_name */
14275 sizeof(unicodeiterobject), /* tp_basicsize */
14276 0, /* tp_itemsize */
14277 /* methods */
14278 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14279 0, /* tp_print */
14280 0, /* tp_getattr */
14281 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014282 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 0, /* tp_repr */
14284 0, /* tp_as_number */
14285 0, /* tp_as_sequence */
14286 0, /* tp_as_mapping */
14287 0, /* tp_hash */
14288 0, /* tp_call */
14289 0, /* tp_str */
14290 PyObject_GenericGetAttr, /* tp_getattro */
14291 0, /* tp_setattro */
14292 0, /* tp_as_buffer */
14293 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14294 0, /* tp_doc */
14295 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14296 0, /* tp_clear */
14297 0, /* tp_richcompare */
14298 0, /* tp_weaklistoffset */
14299 PyObject_SelfIter, /* tp_iter */
14300 (iternextfunc)unicodeiter_next, /* tp_iternext */
14301 unicodeiter_methods, /* tp_methods */
14302 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014303};
14304
14305static PyObject *
14306unicode_iter(PyObject *seq)
14307{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014308 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014309
Benjamin Peterson14339b62009-01-31 16:36:08 +000014310 if (!PyUnicode_Check(seq)) {
14311 PyErr_BadInternalCall();
14312 return NULL;
14313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014314 if (PyUnicode_READY(seq) == -1)
14315 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14317 if (it == NULL)
14318 return NULL;
14319 it->it_index = 0;
14320 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014321 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014322 _PyObject_GC_TRACK(it);
14323 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014324}
14325
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014326
14327size_t
14328Py_UNICODE_strlen(const Py_UNICODE *u)
14329{
14330 int res = 0;
14331 while(*u++)
14332 res++;
14333 return res;
14334}
14335
14336Py_UNICODE*
14337Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14338{
14339 Py_UNICODE *u = s1;
14340 while ((*u++ = *s2++));
14341 return s1;
14342}
14343
14344Py_UNICODE*
14345Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14346{
14347 Py_UNICODE *u = s1;
14348 while ((*u++ = *s2++))
14349 if (n-- == 0)
14350 break;
14351 return s1;
14352}
14353
14354Py_UNICODE*
14355Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14356{
14357 Py_UNICODE *u1 = s1;
14358 u1 += Py_UNICODE_strlen(u1);
14359 Py_UNICODE_strcpy(u1, s2);
14360 return s1;
14361}
14362
14363int
14364Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14365{
14366 while (*s1 && *s2 && *s1 == *s2)
14367 s1++, s2++;
14368 if (*s1 && *s2)
14369 return (*s1 < *s2) ? -1 : +1;
14370 if (*s1)
14371 return 1;
14372 if (*s2)
14373 return -1;
14374 return 0;
14375}
14376
14377int
14378Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14379{
14380 register Py_UNICODE u1, u2;
14381 for (; n != 0; n--) {
14382 u1 = *s1;
14383 u2 = *s2;
14384 if (u1 != u2)
14385 return (u1 < u2) ? -1 : +1;
14386 if (u1 == '\0')
14387 return 0;
14388 s1++;
14389 s2++;
14390 }
14391 return 0;
14392}
14393
14394Py_UNICODE*
14395Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14396{
14397 const Py_UNICODE *p;
14398 for (p = s; *p; p++)
14399 if (*p == c)
14400 return (Py_UNICODE*)p;
14401 return NULL;
14402}
14403
14404Py_UNICODE*
14405Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14406{
14407 const Py_UNICODE *p;
14408 p = s + Py_UNICODE_strlen(s);
14409 while (p != s) {
14410 p--;
14411 if (*p == c)
14412 return (Py_UNICODE*)p;
14413 }
14414 return NULL;
14415}
Victor Stinner331ea922010-08-10 16:37:20 +000014416
Victor Stinner71133ff2010-09-01 23:43:53 +000014417Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014418PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014419{
Victor Stinner577db2c2011-10-11 22:12:48 +020014420 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014421 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014423 if (!PyUnicode_Check(unicode)) {
14424 PyErr_BadArgument();
14425 return NULL;
14426 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014427 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014428 if (u == NULL)
14429 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014430 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014431 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014432 PyErr_NoMemory();
14433 return NULL;
14434 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014435 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014436 size *= sizeof(Py_UNICODE);
14437 copy = PyMem_Malloc(size);
14438 if (copy == NULL) {
14439 PyErr_NoMemory();
14440 return NULL;
14441 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014442 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014443 return copy;
14444}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014445
Georg Brandl66c221e2010-10-14 07:04:07 +000014446/* A _string module, to export formatter_parser and formatter_field_name_split
14447 to the string.Formatter class implemented in Python. */
14448
14449static PyMethodDef _string_methods[] = {
14450 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14451 METH_O, PyDoc_STR("split the argument as a field name")},
14452 {"formatter_parser", (PyCFunction) formatter_parser,
14453 METH_O, PyDoc_STR("parse the argument as a format string")},
14454 {NULL, NULL}
14455};
14456
14457static struct PyModuleDef _string_module = {
14458 PyModuleDef_HEAD_INIT,
14459 "_string",
14460 PyDoc_STR("string helper module"),
14461 0,
14462 _string_methods,
14463 NULL,
14464 NULL,
14465 NULL,
14466 NULL
14467};
14468
14469PyMODINIT_FUNC
14470PyInit__string(void)
14471{
14472 return PyModule_Create(&_string_module);
14473}
14474
14475
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014476#ifdef __cplusplus
14477}
14478#endif