blob: e97ce1f6e6c5727fca9969e5c9b13245ffb64250 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN and BYTEORDER_IS_LITTLE_ENDIAN is
   defined, depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used inside the assertions of the accessor macros below.
   With Py_DEBUG it expands to _PyUnicode_CheckConsistency(op, 0), i.e. the
   structural consistency check without the content scan; otherwise it is a
   plain PyUnicode_Check(op). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.  The macros that expand to a bare cast-and-member access
   (_PyUnicode_UTF8, _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR,
   _PyUnicode_WSTR_LENGTH, _PyUnicode_LENGTH, _PyUnicode_STATE,
   _PyUnicode_HASH, _PyUnicode_DATA_ANY) perform no checks and can be used as
   assignment targets.  The others assert on their argument first. */

/* utf8 member of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII string this is the address
   right after the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII string this is the length
   member of the PyASCIIObject header, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr member (PyASCIIObject) and wstr_length member
   (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, read after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the object viewed as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): after asserting
   _PyUnicode_CHECK(op), evaluates to 0 when PyUnicode_IS_READY(op) is true
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                  \
     (PyUnicode_IS_READY(op) ?                      \
      0 :                                           \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the UTF-8 pointer of op is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its character
   data.  The comparison uses the macro argument itself; it must not refer
   to any identifier from the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from PyUnicode_DATA(op) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   object is not ready or the pointer differs from PyUnicode_DATA(op) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast to to_type.  The main
   loop copies four elements per iteration; the trailing loop handles the
   remaining 0-3 elements.  The "to" argument is parenthesized before the
   cast so that an expression such as "buf + offset" is evaluated in the
   caller's pointer type first and only then converted to "to_type *". */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255). */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters.
   One entry per 7-bit code: 1 if the code is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
219
/* forward declarations of file-local helpers that are referenced before
   their definitions */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static void copy_characters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start,
    Py_ssize_t how_many);
static int unicode_modifiable(PyObject *unicode);


/* Constructors from raw character buffers of a given width. */
static PyObject *
unicode_fromascii(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: one entry per 7-bit code, 1 if the code is a line
   break, 0 otherwise.  The table is only ever read, so it is const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build structural check of a Unicode object.

   Asserts the invariants that tie together the state bits (kind, compact,
   ascii, ready, interned), the data pointer and the cached utf8 / wstr
   representations.  When check_content is non-zero and the string is not
   of the wchar kind, it additionally scans every character and asserts
   that the largest one falls in the range matching the stored kind.

   Always returns 1, so that it can be wrapped in assert(); any violation
   aborts through one of the inner assertions. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject header is inspected */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the data starts right after the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: the data pointer is stored in the object */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr buffer is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind matches the
               width of wchar_t, and must not alias it otherwise */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cached representation must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
#ifdef HAVE_MBCS
/* Windows version information; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
/* Number of bits used by the bloom mask: the largest of 128, 64 or 32 that
   does not exceed LONG_BIT.  Compilation fails if LONG_BIT is below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
525
/* Integer type holding one bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low-order bits
   of ch (ch modulo BLOOM_WIDTH).  BLOOM evaluates to non-zero when the bit
   is set.  Both macro arguments are parenthesized so that arbitrary
   expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: a table lookup for ch < 128; otherwise the bloom mask
   is consulted first and Py_UNICODE_ISLINEBREAK() is only evaluated when
   the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* Membership test of chr in the string str: the bloom mask is checked
   first, and PyUnicode_FindChar() over the whole string is only evaluated
   when the mask bit for chr is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
/* Forward declaration; fixfct is a callback taking the string object and
   returning a Py_UCS4. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
/* Resize a ready, compact string to `length` characters by reallocating the
   object itself.

   Returns the (possibly moved) object on success.  On failure a MemoryError
   is set and NULL is returned; if the reallocation itself failed, the
   original object is re-registered with _Py_NewReference() first.

   NOTE(review): only a wstr pointer that aliases the character data is
   updated here; a separately allocated wstr or utf8 buffer is not touched
   by this function -- confirm that callers rule those cases out. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the number of bytes per character */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember the aliasing before the object may move */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* overflow check for struct_size + (length + 1) * char_size */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* one extra character for the terminating null written below */
    new_size = (struct_size + (length + 1) * char_size);

    /* unregister the object around the realloc, since its address may
       change */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is still valid: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* keep wstr pointing at the (possibly relocated) data */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    return unicode;
}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one extra character to make sure the string is always
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
910char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200911 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912}
913
914void *_PyUnicode_compact_data(void *unicode) {
915 return _PyUnicode_COMPACT_DATA(unicode);
916}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
Benjamin Petersonbac79492012-01-14 13:34:47 -05001266 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001268 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *base = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string that is not in the wchar_t kind can be
       a cached Latin-1 character. */
    if (base->state.kind == PyUnicode_WCHAR_KIND || base->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001794 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001895 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001962 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
Benjamin Petersonbac79492012-01-14 13:34:47 -05001988 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001997 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w` of `size` elements.

   A size of -1 means the length is determined with wcslen().  A NULL
   buffer is accepted only together with size 0 (the shared empty string is
   returned); any other size with a NULL buffer raises via
   PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between '%' and the conversion character.

   `f` must point at the '%'.  The optional "width" and ".precision" digits
   are parsed, followed by an optional length modifier: "l", "ll" (only with
   HAVE_LONG_LONG) or "z".  A length modifier is only recognised when it is
   directly followed by 'd', 'u' or 'i'.

   Each p_* output pointer may be NULL, in which case that value is simply
   not reported.

   Returns a pointer to the conversion character.  For bogus specifications
   the pointer is moved back by one character so that the caller lands in
   its "unknown format code" branch:
     - "%.3%s" => the result points to "3"
     - "%.1"   => the result points to "1" */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;            /* step over the '%' */
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* width.precision, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cur); cur++)
        field_width = (field_width*10) + *cur - '0';

    if (*cur == '.') {
        cur++;
        for (; Py_ISDIGIT((unsigned)*cur); cur++)
            field_precision = (field_precision*10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => point back to "3" */
            cur--;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, point to "1" */
        cur--;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    if (*cur == 'l') {
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            is_long = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' &&
                 (cur[2] == 'd' || cur[2] == 'u' || cur[2] == 'i')) {
            is_longlong = 1;
            cur += 2;
        }
#endif
    }
    else if (*cur == 'z' && (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i')) {
        is_size_t = 1;
        cur += 1;
    }

    /* report whatever the caller asked for */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return cur;
}
2228
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229/* maximum number of characters required for output of %ld. 21 characters
2230 allows for 64-bit integers (in decimal) and an optional sign. */
2231#define MAX_LONG_CHARS 21
2232/* maximum number of characters required for output of %lld.
2233 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2234 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2235#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002474 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002491 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002493 if (PyUnicode_READY(str) == -1) {
2494 Py_DECREF(str);
2495 goto fail;
2496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002498 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 /* Remember the str and switch to the next slot */
2501 *callresult++ = str;
2502 break;
2503 }
2504 case 'R':
2505 {
2506 PyObject *obj = va_arg(count, PyObject *);
2507 PyObject *repr;
2508 assert(obj);
2509 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002510 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002512 if (PyUnicode_READY(repr) == -1) {
2513 Py_DECREF(repr);
2514 goto fail;
2515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002517 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 /* Remember the repr and switch to the next slot */
2520 *callresult++ = repr;
2521 break;
2522 }
2523 case 'A':
2524 {
2525 PyObject *obj = va_arg(count, PyObject *);
2526 PyObject *ascii;
2527 assert(obj);
2528 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002529 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002531 if (PyUnicode_READY(ascii) == -1) {
2532 Py_DECREF(ascii);
2533 goto fail;
2534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002536 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 /* Remember the repr and switch to the next slot */
2539 *callresult++ = ascii;
2540 break;
2541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 default:
2543 /* if we stumble upon an unknown
2544 formatting code, copy the rest of
2545 the format string to the output
2546 string. (we cannot just skip the
2547 code, since there's no way to know
2548 what's in the argument list) */
2549 n += strlen(p);
2550 goto expand;
2551 }
2552 } else
2553 n++;
2554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 we don't have to resize the string.
2559 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 if (!string)
2562 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 kind = PyUnicode_KIND(string);
2564 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002570 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002571
2572 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2574 /* checking for == because the last argument could be a empty
2575 string, which causes i to point to end, the assert at the end of
2576 the loop */
2577 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002578
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 switch (*f) {
2580 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002581 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const int ordinal = va_arg(vargs, int);
2583 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002586 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 case 'p':
2591 /* unused, since we already have the result */
2592 if (*f == 'p')
2593 (void) va_arg(vargs, void *);
2594 else
2595 (void) va_arg(vargs, int);
2596 /* extract the result from numberresults and append. */
2597 for (; *numberresult; ++i, ++numberresult)
2598 PyUnicode_WRITE(kind, data, i, *numberresult);
2599 /* skip over the separating '\0' */
2600 assert(*numberresult == '\0');
2601 numberresult++;
2602 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 case 's':
2605 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 size = PyUnicode_GET_LENGTH(*callresult);
2610 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002611 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002613 /* We're done with the unicode()/repr() => forget it */
2614 Py_DECREF(*callresult);
2615 /* switch to next unicode()/repr() result */
2616 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 break;
2618 }
2619 case 'U':
2620 {
2621 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 Py_ssize_t size;
2623 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2624 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 break;
2628 }
2629 case 'V':
2630 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 size = PyUnicode_GET_LENGTH(obj);
2636 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002637 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 size = PyUnicode_GET_LENGTH(*callresult);
2641 assert(PyUnicode_KIND(*callresult) <=
2642 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002643 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002645 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 break;
2649 }
2650 case 'S':
2651 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002652 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002654 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* unused, since we already have the result */
2656 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 copy_characters(string, i, *callresult, 0, size);
2659 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* We're done with the unicode()/repr() => forget it */
2661 Py_DECREF(*callresult);
2662 /* switch to next unicode()/repr() result */
2663 ++callresult;
2664 break;
2665 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 break;
2669 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 for (; *p; ++p, ++i)
2671 PyUnicode_WRITE(kind, data, i, *p);
2672 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 goto end;
2674 }
Victor Stinner1205f272010-09-11 00:54:47 +00002675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 else {
2677 assert(i < PyUnicode_GET_LENGTH(string));
2678 PyUnicode_WRITE(kind, data, i++, *f);
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002682
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 if (callresults)
2685 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002688 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 if (callresults) {
2691 PyObject **callresult2 = callresults;
2692 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002693 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 ++callresult2;
2695 }
2696 PyObject_Free(callresults);
2697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (numberresults)
2699 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701}
2702
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703PyObject *
2704PyUnicode_FromFormat(const char *format, ...)
2705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 PyObject* ret;
2707 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
2709#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 ret = PyUnicode_FromFormatV(format, vargs);
2715 va_end(vargs);
2716 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717}
2718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719#ifdef HAVE_WCHAR_H
2720
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2722 convert a Unicode object to a wide character string.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) required to convert the unicode object. Ignore size argument.
2726
Victor Stinnerd88d9832011-09-06 02:00:05 +02002727 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 wchar_t *w,
2733 Py_ssize_t size)
2734{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 const wchar_t *wstr;
2737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002738 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 if (wstr == NULL)
2740 return -1;
2741
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (size > res)
2744 size = res + 1;
2745 else
2746 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 return res;
2749 }
2750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752}
2753
2754Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002755PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 wchar_t *w,
2757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 PyErr_BadInternalCall();
2761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002763 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Victor Stinner137c34c2010-09-29 10:25:54 +00002766wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002767PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 Py_ssize_t *size)
2769{
2770 wchar_t* buffer;
2771 Py_ssize_t buflen;
2772
2773 if (unicode == NULL) {
2774 PyErr_BadInternalCall();
2775 return NULL;
2776 }
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (buflen == -1)
2780 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 PyErr_NoMemory();
2783 return NULL;
2784 }
2785
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2787 if (buffer == NULL) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (size != NULL)
2795 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002796 return buffer;
2797}
2798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
Alexander Belopolsky40018472011-02-26 01:02:56 +00002801PyObject *
2802PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002805 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_SetString(PyExc_ValueError,
2807 "chr() arg not in range(0x110000)");
2808 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (ordinal < 256)
2812 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 v = PyUnicode_New(1, ordinal);
2815 if (v == NULL)
2816 return NULL;
2817 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002818 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820}
2821
Alexander Belopolsky40018472011-02-26 01:02:56 +00002822PyObject *
2823PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002828 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002829 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 Py_INCREF(obj);
2831 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832 }
2833 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 /* For a Unicode subtype that's not a Unicode object,
2835 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002836 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002838 PyErr_Format(PyExc_TypeError,
2839 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002840 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002841 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 const char *encoding,
2847 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 PyErr_BadInternalCall();
2854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 /* Decoding bytes objects is the most common case and should be fast */
2858 if (PyBytes_Check(obj)) {
2859 if (PyBytes_GET_SIZE(obj) == 0) {
2860 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002861 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 }
2863 else {
2864 v = PyUnicode_Decode(
2865 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2866 encoding, errors);
2867 }
2868 return v;
2869 }
2870
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 PyErr_SetString(PyExc_TypeError,
2873 "decoding str is not supported");
2874 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2878 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2879 PyErr_Format(PyExc_TypeError,
2880 "coercing to str: need bytes, bytearray "
2881 "or buffer-like object, %.80s found",
2882 Py_TYPE(obj)->tp_name);
2883 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002884 }
Tim Petersced69f82003-09-16 20:30:58 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002888 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Tim Petersced69f82003-09-16 20:30:58 +00002890 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002894 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895}
2896
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  NUL-terminated encoding name, or NULL meaning "utf-8".
   lower:     output buffer receiving the normalized, NUL-terminated name.
   lower_len: size of `lower` in bytes, including room for the terminator.

   Return 1 on success, 0 on error (the normalized name, including the
   implied "utf-8" for a NULL encoding, does not fit in lower_len-1
   characters).  On error the contents of `lower` are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; bail out
       before computing lower_len - 1, which would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Honour the buffer size for the default name as well. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* l_end is the last slot of the buffer, reserved for the NUL. */
    l_end = lower + (lower_len - 1);
    for (e = encoding, l = lower; *e != '\0'; e++, l++) {
        if (l == l_end)
            return 0;
        /* Pointers are advanced in the loop header so that the
           Py_ISUPPER/Py_TOLOWER macros never see an argument with a
           side effect. */
        if (Py_ISUPPER(*e))
            *l = Py_TOLOWER(*e);
        else if (*e == '_')
            *l = '-';
        else
            *l = *e;
    }
    *l = '\0';
    return 1;
}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 Py_ssize_t size,
2937 const char *encoding,
2938 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002939{
2940 PyObject *buffer = NULL, *unicode;
2941 Py_buffer info;
2942 char lower[11]; /* Enough for any encoding shortcut */
2943
Fred Drakee4315f52000-05-09 19:53:39 +00002944 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002945 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002946 if ((strcmp(lower, "utf-8") == 0) ||
2947 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002948 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002949 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002951 (strcmp(lower, "iso-8859-1") == 0))
2952 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002953#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002954 else if (strcmp(lower, "mbcs") == 0)
2955 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002956#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if (strcmp(lower, "ascii") == 0)
2958 return PyUnicode_DecodeASCII(s, size, errors);
2959 else if (strcmp(lower, "utf-16") == 0)
2960 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2961 else if (strcmp(lower, "utf-32") == 0)
2962 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964
2965 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002967 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002969 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 if (buffer == NULL)
2971 goto onError;
2972 unicode = PyCodec_Decode(buffer, encoding, errors);
2973 if (unicode == NULL)
2974 goto onError;
2975 if (!PyUnicode_Check(unicode)) {
2976 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002977 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002978 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(unicode);
2980 goto onError;
2981 }
2982 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002983 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 Py_XDECREF(buffer);
2987 return NULL;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
3005 /* Decode via the codec registry */
3006 v = PyCodec_Decode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003009 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
3034 if (!PyUnicode_Check(v)) {
3035 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003036 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 Py_TYPE(v)->tp_name);
3038 Py_DECREF(v);
3039 goto onError;
3040 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 Py_ssize_t size,
3050 const char *encoding,
3051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052{
3053 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 unicode = PyUnicode_FromUnicode(s, size);
3056 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3059 Py_DECREF(unicode);
3060 return v;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Encode via the codec registry */
3079 v = PyCodec_Encode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
3082 return v;
3083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
/* Locate the first character of `wstr` that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (a surrogate
   pair counts as one character when wchar_t is 16 bits wide).  Returns
   the index, in wchar_t units, of the first character that fails, or 0
   when every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* up to two code units + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
    unit[sizeof(unit) / sizeof(unit[0]) - 1] = L'\0';

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = L'\0';
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3134
Victor Stinner1b579672011-12-17 05:47:23 +01003135static int
3136locale_error_handler(const char *errors, int *surrogateescape)
3137{
3138 if (errors == NULL) {
3139 *surrogateescape = 0;
3140 return 0;
3141 }
3142
3143 if (strcmp(errors, "strict") == 0) {
3144 *surrogateescape = 0;
3145 return 0;
3146 }
3147 if (strcmp(errors, "surrogateescape") == 0) {
3148 *surrogateescape = 1;
3149 return 0;
3150 }
3151 PyErr_Format(PyExc_ValueError,
3152 "only 'strict' and 'surrogateescape' error handlers "
3153 "are supported, not '%s'",
3154 errors);
3155 return -1;
3156}
3157
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003158PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003159PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160{
3161 Py_ssize_t wlen, wlen2;
3162 wchar_t *wstr;
3163 PyObject *bytes = NULL;
3164 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003165 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166 PyObject *exc;
3167 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003168 int surrogateescape;
3169
3170 if (locale_error_handler(errors, &surrogateescape) < 0)
3171 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172
3173 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3174 if (wstr == NULL)
3175 return NULL;
3176
3177 wlen2 = wcslen(wstr);
3178 if (wlen2 != wlen) {
3179 PyMem_Free(wstr);
3180 PyErr_SetString(PyExc_TypeError, "embedded null character");
3181 return NULL;
3182 }
3183
3184 if (surrogateescape) {
3185 /* locale encoding with surrogateescape */
3186 char *str;
3187
3188 str = _Py_wchar2char(wstr, &error_pos);
3189 if (str == NULL) {
3190 if (error_pos == (size_t)-1) {
3191 PyErr_NoMemory();
3192 PyMem_Free(wstr);
3193 return NULL;
3194 }
3195 else {
3196 goto encode_error;
3197 }
3198 }
3199 PyMem_Free(wstr);
3200
3201 bytes = PyBytes_FromString(str);
3202 PyMem_Free(str);
3203 }
3204 else {
3205 size_t len, len2;
3206
3207 len = wcstombs(NULL, wstr, 0);
3208 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003209 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003210 goto encode_error;
3211 }
3212
3213 bytes = PyBytes_FromStringAndSize(NULL, len);
3214 if (bytes == NULL) {
3215 PyMem_Free(wstr);
3216 return NULL;
3217 }
3218
3219 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3220 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003221 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003222 goto encode_error;
3223 }
3224 PyMem_Free(wstr);
3225 }
3226 return bytes;
3227
3228encode_error:
3229 errmsg = strerror(errno);
3230 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003231
3232 if (error_pos == (size_t)-1)
3233 error_pos = wcstombs_errorpos(wstr);
3234
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235 PyMem_Free(wstr);
3236 Py_XDECREF(bytes);
3237
Victor Stinner2f197072011-12-17 07:08:30 +01003238 if (errmsg != NULL) {
3239 size_t errlen;
3240 wstr = _Py_char2wchar(errmsg, &errlen);
3241 if (wstr != NULL) {
3242 reason = PyUnicode_FromWideChar(wstr, errlen);
3243 PyMem_Free(wstr);
3244 } else
3245 errmsg = NULL;
3246 }
3247 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003248 reason = PyUnicode_FromString(
3249 "wcstombs() encountered an unencodable "
3250 "wide character");
3251 if (reason == NULL)
3252 return NULL;
3253
3254 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3255 "locale", unicode,
3256 (Py_ssize_t)error_pos,
3257 (Py_ssize_t)(error_pos+1),
3258 reason);
3259 Py_DECREF(reason);
3260 if (exc != NULL) {
3261 PyCodec_StrictErrors(exc);
3262 Py_XDECREF(exc);
3263 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003264 return NULL;
3265}
3266
Victor Stinnerad158722010-10-27 00:25:46 +00003267PyObject *
3268PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003269{
Victor Stinner99b95382011-07-04 14:23:54 +02003270#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003271 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003272#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003274#else
Victor Stinner793b5312011-04-27 00:24:21 +02003275 PyInterpreterState *interp = PyThreadState_GET()->interp;
3276 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3277 cannot use it to encode and decode filenames before it is loaded. Load
3278 the Python codec requires to encode at least its own filename. Use the C
3279 version of the locale codec until the codec registry is initialized and
3280 the Python codec is loaded.
3281
3282 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3283 cannot only rely on it: check also interp->fscodec_initialized for
3284 subinterpreters. */
3285 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003286 return PyUnicode_AsEncodedString(unicode,
3287 Py_FileSystemDefaultEncoding,
3288 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003289 }
3290 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003291 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003292 }
Victor Stinnerad158722010-10-27 00:25:46 +00003293#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003294}
3295
Alexander Belopolsky40018472011-02-26 01:02:56 +00003296PyObject *
3297PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003302 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (!PyUnicode_Check(unicode)) {
3305 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 }
Fred Drakee4315f52000-05-09 19:53:39 +00003308
Fred Drakee4315f52000-05-09 19:53:39 +00003309 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003310 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003311 if ((strcmp(lower, "utf-8") == 0) ||
3312 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003313 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003314 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003318 }
Victor Stinner37296e82010-06-10 13:36:23 +00003319 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003320 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003321 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003323#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003324 else if (strcmp(lower, "mbcs") == 0)
3325 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003326#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003327 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Encode via the codec registry */
3332 v = PyCodec_Encode(unicode, encoding, errors);
3333 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
3335
3336 /* The normal path */
3337 if (PyBytes_Check(v))
3338 return v;
3339
3340 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003341 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003342 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344
3345 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3346 "encoder %s returned bytearray instead of bytes",
3347 encoding);
3348 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003349 Py_DECREF(v);
3350 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003352
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003353 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3354 Py_DECREF(v);
3355 return b;
3356 }
3357
3358 PyErr_Format(PyExc_TypeError,
3359 "encoder did not return a bytes object (type=%.400s)",
3360 Py_TYPE(v)->tp_name);
3361 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003362 return NULL;
3363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003369{
3370 PyObject *v;
3371
3372 if (!PyUnicode_Check(unicode)) {
3373 PyErr_BadArgument();
3374 goto onError;
3375 }
3376
3377 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003379
3380 /* Encode via the codec registry */
3381 v = PyCodec_Encode(unicode, encoding, errors);
3382 if (v == NULL)
3383 goto onError;
3384 if (!PyUnicode_Check(v)) {
3385 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003386 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387 Py_TYPE(v)->tp_name);
3388 Py_DECREF(v);
3389 goto onError;
3390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003392
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 return NULL;
3395}
3396
/* Locate the first byte sequence of `str` that cannot be decoded.

   With HAVE_MBRTOWC, at most `len` bytes are decoded one character at a
   time with mbrtowc(); the byte offset of the first invalid or
   incomplete sequence is returned.  0 is returned when no such sequence
   is found, and always when mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    wchar_t ch;

    memset(&state, 0, sizeof state);
    while (len) {
        size_t consumed = mbrtowc(&ch, cursor, len, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        len -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3427
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003428PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003429PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003430 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431{
3432 wchar_t smallbuf[256];
3433 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3434 wchar_t *wstr;
3435 size_t wlen, wlen2;
3436 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003437 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003438 size_t error_pos;
3439 char *errmsg;
3440 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003441
3442 if (locale_error_handler(errors, &surrogateescape) < 0)
3443 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003444
3445 if (str[len] != '\0' || len != strlen(str)) {
3446 PyErr_SetString(PyExc_TypeError, "embedded null character");
3447 return NULL;
3448 }
3449
3450 if (surrogateescape)
3451 {
3452 wstr = _Py_char2wchar(str, &wlen);
3453 if (wstr == NULL) {
3454 if (wlen == (size_t)-1)
3455 PyErr_NoMemory();
3456 else
3457 PyErr_SetFromErrno(PyExc_OSError);
3458 return NULL;
3459 }
3460
3461 unicode = PyUnicode_FromWideChar(wstr, wlen);
3462 PyMem_Free(wstr);
3463 }
3464 else {
3465#ifndef HAVE_BROKEN_MBSTOWCS
3466 wlen = mbstowcs(NULL, str, 0);
3467#else
3468 wlen = len;
3469#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003470 if (wlen == (size_t)-1)
3471 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472 if (wlen+1 <= smallbuf_len) {
3473 wstr = smallbuf;
3474 }
3475 else {
3476 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3477 return PyErr_NoMemory();
3478
3479 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3480 if (!wstr)
3481 return PyErr_NoMemory();
3482 }
3483
3484 /* This shouldn't fail now */
3485 wlen2 = mbstowcs(wstr, str, wlen+1);
3486 if (wlen2 == (size_t)-1) {
3487 if (wstr != smallbuf)
3488 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003489 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003490 }
3491#ifdef HAVE_BROKEN_MBSTOWCS
3492 assert(wlen2 == wlen);
3493#endif
3494 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3495 if (wstr != smallbuf)
3496 PyMem_Free(wstr);
3497 }
3498 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003499
3500decode_error:
3501 errmsg = strerror(errno);
3502 assert(errmsg != NULL);
3503
3504 error_pos = mbstowcs_errorpos(str, len);
3505 if (errmsg != NULL) {
3506 size_t errlen;
3507 wstr = _Py_char2wchar(errmsg, &errlen);
3508 if (wstr != NULL) {
3509 reason = PyUnicode_FromWideChar(wstr, errlen);
3510 PyMem_Free(wstr);
3511 } else
3512 errmsg = NULL;
3513 }
3514 if (errmsg == NULL)
3515 reason = PyUnicode_FromString(
3516 "mbstowcs() encountered an invalid multibyte sequence");
3517 if (reason == NULL)
3518 return NULL;
3519
3520 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3521 "locale", str, len,
3522 (Py_ssize_t)error_pos,
3523 (Py_ssize_t)(error_pos+1),
3524 reason);
3525 Py_DECREF(reason);
3526 if (exc != NULL) {
3527 PyCodec_StrictErrors(exc);
3528 Py_XDECREF(exc);
3529 }
3530 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531}
3532
3533PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003534PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003535{
3536 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003537 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538}
3539
3540
3541PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003542PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003543 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003544 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3545}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003546
Christian Heimes5894ba72007-11-04 11:43:14 +00003547PyObject*
3548PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3549{
Victor Stinner99b95382011-07-04 14:23:54 +02003550#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003551 return PyUnicode_DecodeMBCS(s, size, NULL);
3552#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003553 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003554#else
Victor Stinner793b5312011-04-27 00:24:21 +02003555 PyInterpreterState *interp = PyThreadState_GET()->interp;
3556 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3557 cannot use it to encode and decode filenames before it is loaded. Load
3558 the Python codec requires to encode at least its own filename. Use the C
3559 version of the locale codec until the codec registry is initialized and
3560 the Python codec is loaded.
3561
3562 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3563 cannot only rely on it: check also interp->fscodec_initialized for
3564 subinterpreters. */
3565 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003566 return PyUnicode_Decode(s, size,
3567 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003568 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003569 }
3570 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003571 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572 }
Victor Stinnerad158722010-10-27 00:25:46 +00003573#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574}
3575
Martin v. Löwis011e8422009-05-05 04:43:17 +00003576
3577int
3578PyUnicode_FSConverter(PyObject* arg, void* addr)
3579{
3580 PyObject *output = NULL;
3581 Py_ssize_t size;
3582 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003583 if (arg == NULL) {
3584 Py_DECREF(*(PyObject**)addr);
3585 return 1;
3586 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003587 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003588 output = arg;
3589 Py_INCREF(output);
3590 }
3591 else {
3592 arg = PyUnicode_FromObject(arg);
3593 if (!arg)
3594 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003595 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003596 Py_DECREF(arg);
3597 if (!output)
3598 return 0;
3599 if (!PyBytes_Check(output)) {
3600 Py_DECREF(output);
3601 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3602 return 0;
3603 }
3604 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003605 size = PyBytes_GET_SIZE(output);
3606 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003607 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003608 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609 Py_DECREF(output);
3610 return 0;
3611 }
3612 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003613 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003614}
3615
3616
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003617int
3618PyUnicode_FSDecoder(PyObject* arg, void* addr)
3619{
3620 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003621 if (arg == NULL) {
3622 Py_DECREF(*(PyObject**)addr);
3623 return 1;
3624 }
3625 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003626 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003627 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003628 output = arg;
3629 Py_INCREF(output);
3630 }
3631 else {
3632 arg = PyBytes_FromObject(arg);
3633 if (!arg)
3634 return 0;
3635 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3636 PyBytes_GET_SIZE(arg));
3637 Py_DECREF(arg);
3638 if (!output)
3639 return 0;
3640 if (!PyUnicode_Check(output)) {
3641 Py_DECREF(output);
3642 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3643 return 0;
3644 }
3645 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003646 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003647 Py_DECREF(output);
3648 return 0;
3649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003650 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003651 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003652 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3653 Py_DECREF(output);
3654 return 0;
3655 }
3656 *(PyObject**)addr = output;
3657 return Py_CLEANUP_SUPPORTED;
3658}
3659
3660
Martin v. Löwis5b222132007-06-10 09:51:05 +00003661char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003662PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003663{
Christian Heimesf3863112007-11-22 07:46:41 +00003664 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003666 if (!PyUnicode_Check(unicode)) {
3667 PyErr_BadArgument();
3668 return NULL;
3669 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003670 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003671 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003672
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003673 if (PyUnicode_UTF8(unicode) == NULL) {
3674 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3676 if (bytes == NULL)
3677 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003678 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3679 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 Py_DECREF(bytes);
3681 return NULL;
3682 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003683 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3684 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3685 PyBytes_AS_STRING(bytes),
3686 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 Py_DECREF(bytes);
3688 }
3689
3690 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003691 *psize = PyUnicode_UTF8_LENGTH(unicode);
3692 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003693}
3694
3695char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3699}
3700
3701#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003702static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003703#endif
3704
3705
3706Py_UNICODE *
3707PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 const unsigned char *one_byte;
3710#if SIZEOF_WCHAR_T == 4
3711 const Py_UCS2 *two_bytes;
3712#else
3713 const Py_UCS4 *four_bytes;
3714 const Py_UCS4 *ucs4_end;
3715 Py_ssize_t num_surrogates;
3716#endif
3717 wchar_t *w;
3718 wchar_t *wchar_end;
3719
3720 if (!PyUnicode_Check(unicode)) {
3721 PyErr_BadArgument();
3722 return NULL;
3723 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003724 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 assert(_PyUnicode_KIND(unicode) != 0);
3727 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728
3729#ifdef Py_DEBUG
3730 ++unicode_as_unicode_calls;
3731#endif
3732
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003733 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003735 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3736 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 num_surrogates = 0;
3738
3739 for (; four_bytes < ucs4_end; ++four_bytes) {
3740 if (*four_bytes > 0xFFFF)
3741 ++num_surrogates;
3742 }
3743
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003744 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3745 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3746 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747 PyErr_NoMemory();
3748 return NULL;
3749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 w = _PyUnicode_WSTR(unicode);
3753 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3754 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3756 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003757 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003759 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3760 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 }
3762 else
3763 *w = *four_bytes;
3764
3765 if (w > wchar_end) {
3766 assert(0 && "Miscalculated string end");
3767 }
3768 }
3769 *w = 0;
3770#else
3771 /* sizeof(wchar_t) == 4 */
3772 Py_FatalError("Impossible unicode object state, wstr and str "
3773 "should share memory already.");
3774 return NULL;
3775#endif
3776 }
3777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3779 (_PyUnicode_LENGTH(unicode) + 1));
3780 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 PyErr_NoMemory();
3782 return NULL;
3783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3785 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3786 w = _PyUnicode_WSTR(unicode);
3787 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003789 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3790 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 for (; w < wchar_end; ++one_byte, ++w)
3792 *w = *one_byte;
3793 /* null-terminate the wstr */
3794 *w = 0;
3795 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003796 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003798 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 for (; w < wchar_end; ++two_bytes, ++w)
3800 *w = *two_bytes;
3801 /* null-terminate the wstr */
3802 *w = 0;
3803#else
3804 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003805 PyObject_FREE(_PyUnicode_WSTR(unicode));
3806 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 Py_FatalError("Impossible unicode object state, wstr "
3808 "and str should share memory already.");
3809 return NULL;
3810#endif
3811 }
3812 else {
3813 assert(0 && "This should never happen.");
3814 }
3815 }
3816 }
3817 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 *size = PyUnicode_WSTR_LENGTH(unicode);
3819 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003820}
3821
Alexander Belopolsky40018472011-02-26 01:02:56 +00003822Py_UNICODE *
3823PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826}
3827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828
Alexander Belopolsky40018472011-02-26 01:02:56 +00003829Py_ssize_t
3830PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831{
3832 if (!PyUnicode_Check(unicode)) {
3833 PyErr_BadArgument();
3834 goto onError;
3835 }
3836 return PyUnicode_GET_SIZE(unicode);
3837
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 return -1;
3840}
3841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842Py_ssize_t
3843PyUnicode_GetLength(PyObject *unicode)
3844{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003845 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846 PyErr_BadArgument();
3847 return -1;
3848 }
3849
3850 return PyUnicode_GET_LENGTH(unicode);
3851}
3852
3853Py_UCS4
3854PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3855{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003856 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3857 PyErr_BadArgument();
3858 return (Py_UCS4)-1;
3859 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003860 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003861 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 return (Py_UCS4)-1;
3863 }
3864 return PyUnicode_READ_CHAR(unicode, index);
3865}
3866
3867int
3868PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3869{
3870 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003871 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 return -1;
3873 }
Victor Stinner488fa492011-12-12 00:01:39 +01003874 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003875 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003876 PyErr_SetString(PyExc_IndexError, "string index out of range");
3877 return -1;
3878 }
Victor Stinner488fa492011-12-12 00:01:39 +01003879 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003880 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3882 index, ch);
3883 return 0;
3884}
3885
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3891
Victor Stinner554f3f02010-06-16 23:33:54 +00003892/* create or adjust a UnicodeDecodeError */
3893static void
3894make_decode_exception(PyObject **exceptionObject,
3895 const char *encoding,
3896 const char *input, Py_ssize_t length,
3897 Py_ssize_t startpos, Py_ssize_t endpos,
3898 const char *reason)
3899{
3900 if (*exceptionObject == NULL) {
3901 *exceptionObject = PyUnicodeDecodeError_Create(
3902 encoding, input, length, startpos, endpos, reason);
3903 }
3904 else {
3905 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3906 goto onError;
3907 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3908 goto onError;
3909 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3910 goto onError;
3911 }
3912 return;
3913
3914onError:
3915 Py_DECREF(*exceptionObject);
3916 *exceptionObject = NULL;
3917}
3918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919/* error handling callback helper:
3920 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003921 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 and adjust various state variables.
3923 return 0 on success, -1 on error
3924*/
3925
Alexander Belopolsky40018472011-02-26 01:02:56 +00003926static int
3927unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003928 const char *encoding, const char *reason,
3929 const char **input, const char **inend, Py_ssize_t *startinpos,
3930 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003931 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003933 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934
3935 PyObject *restuple = NULL;
3936 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003937 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003938 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003939 Py_ssize_t requiredsize;
3940 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003941 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 int res = -1;
3943
Victor Stinner596a6c42011-11-09 00:02:18 +01003944 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3945 outsize = PyUnicode_GET_LENGTH(*output);
3946 else
3947 outsize = _PyUnicode_WSTR_LENGTH(*output);
3948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 *errorHandler = PyCodec_LookupError(errors);
3951 if (*errorHandler == NULL)
3952 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 }
3954
Victor Stinner554f3f02010-06-16 23:33:54 +00003955 make_decode_exception(exceptionObject,
3956 encoding,
3957 *input, *inend - *input,
3958 *startinpos, *endinpos,
3959 reason);
3960 if (*exceptionObject == NULL)
3961 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962
3963 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3964 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003967 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003968 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 }
3970 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003972 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003974
3975 /* Copy back the bytes variables, which might have been modified by the
3976 callback */
3977 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3978 if (!inputobj)
3979 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003980 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003982 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003983 *input = PyBytes_AS_STRING(inputobj);
3984 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003985 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003986 /* we can DECREF safely, as the exception has another reference,
3987 so the object won't go away. */
3988 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003989
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003992 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3994 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003995 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996
Victor Stinner596a6c42011-11-09 00:02:18 +01003997 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3998 /* need more space? (at least enough for what we
3999 have+the replacement+the rest of the string (starting
4000 at the new input position), so we won't have to check space
4001 when there are no errors in the rest of the string) */
4002 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4003 requiredsize = *outpos + replen + insize-newpos;
4004 if (requiredsize > outsize) {
4005 if (requiredsize<2*outsize)
4006 requiredsize = 2*outsize;
4007 if (unicode_resize(output, requiredsize) < 0)
4008 goto onError;
4009 }
4010 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004012 copy_characters(*output, *outpos, repunicode, 0, replen);
4013 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004015 else {
4016 wchar_t *repwstr;
4017 Py_ssize_t repwlen;
4018 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4019 if (repwstr == NULL)
4020 goto onError;
4021 /* need more space? (at least enough for what we
4022 have+the replacement+the rest of the string (starting
4023 at the new input position), so we won't have to check space
4024 when there are no errors in the rest of the string) */
4025 requiredsize = *outpos + repwlen + insize-newpos;
4026 if (requiredsize > outsize) {
4027 if (requiredsize < 2*outsize)
4028 requiredsize = 2*outsize;
4029 if (unicode_resize(output, requiredsize) < 0)
4030 goto onError;
4031 }
4032 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4033 *outpos += repwlen;
4034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004036 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 /* we made it! */
4039 res = 0;
4040
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 Py_XDECREF(restuple);
4043 return res;
4044}
4045
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three small helper macros for base-64 handling. */

/* True if c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Map a base-64 character to its 6-bit value.  Only meaningful when
   IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder sorts ASCII characters into four classes (see
 * RFC2152).  utf7_category maps each ASCII code to its class:
 *   0 : "Set D"
 *        alphanumeric and '(),-./:?
 *   1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *        ht nl cr sp
 *   3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: true if character c may be written to the output as
 * itself.  Set D always may; Set O only when directO is true, whitespace
 * only when directWS is true.  RFC2152 leaves both choices to the
 * application, hence the two flags.  NUL and non-ASCII never qualify. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004126
Alexander Belopolsky40018472011-02-26 01:02:56 +00004127PyObject *
4128PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004129 Py_ssize_t size,
4130 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004131{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004132 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4133}
4134
Antoine Pitrou244651a2009-05-04 18:56:13 +00004135/* The decoder. The only state we preserve is our read position,
4136 * i.e. how many characters we have consumed. So if we end in the
4137 * middle of a shift sequence we have to back off the read position
4138 * and the output to the beginning of the sequence, otherwise we lose
4139 * all the shift state (seen bits, number of bits seen, high
4140 * surrogate). */
4141
Alexander Belopolsky40018472011-02-26 01:02:56 +00004142PyObject *
4143PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004144 Py_ssize_t size,
4145 const char *errors,
4146 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004147{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004149 Py_ssize_t startinpos;
4150 Py_ssize_t endinpos;
4151 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004152 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004153 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004154 const char *errmsg = "";
4155 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004157 unsigned int base64bits = 0;
4158 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004159 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 PyObject *errorHandler = NULL;
4161 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004162
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004163 /* Start off assuming it's all ASCII. Widen later as necessary. */
4164 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165 if (!unicode)
4166 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004167 if (size == 0) {
4168 if (consumed)
4169 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004170 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004171 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004173 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004174 e = s + size;
4175
4176 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004177 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004179 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180
Antoine Pitrou244651a2009-05-04 18:56:13 +00004181 if (inShift) { /* in a base-64 section */
4182 if (IS_BASE64(ch)) { /* consume a base-64 character */
4183 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4184 base64bits += 6;
4185 s++;
4186 if (base64bits >= 16) {
4187 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004188 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004189 base64bits -= 16;
4190 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4191 if (surrogate) {
4192 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004193 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4194 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004195 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4196 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004197 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004198 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004199 }
4200 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004201 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4202 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004203 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004204 }
4205 }
Victor Stinner551ac952011-11-29 22:58:13 +01004206 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004207 /* first surrogate */
4208 surrogate = outCh;
4209 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004210 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004211 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4212 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004213 }
4214 }
4215 }
4216 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217 inShift = 0;
4218 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004220 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4221 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004222 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004224 if (base64bits > 0) { /* left-over bits */
4225 if (base64bits >= 6) {
4226 /* We've seen at least one base-64 character */
4227 errmsg = "partial character in shift sequence";
4228 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004229 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004230 else {
4231 /* Some bits remain; they should be zero */
4232 if (base64buffer != 0) {
4233 errmsg = "non-zero padding bits in shift sequence";
4234 goto utf7Error;
4235 }
4236 }
4237 }
4238 if (ch != '-') {
4239 /* '-' is absorbed; other terminating
4240 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004241 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4242 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004243 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244 }
4245 }
4246 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004248 s++; /* consume '+' */
4249 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004250 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004251 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4252 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004253 }
4254 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004255 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004256 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004257 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258 }
4259 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004260 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004261 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4262 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263 s++;
4264 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265 else {
4266 startinpos = s-starts;
4267 s++;
4268 errmsg = "unexpected special character";
4269 goto utf7Error;
4270 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 endinpos = s-starts;
4274 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 errors, &errorHandler,
4276 "utf7", errmsg,
4277 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004278 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280 }
4281
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 /* end of string */
4283
4284 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4285 /* if we're in an inconsistent state, that's an error */
4286 if (surrogate ||
4287 (base64bits >= 6) ||
4288 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004289 endinpos = size;
4290 if (unicode_decode_call_errorhandler(
4291 errors, &errorHandler,
4292 "utf7", "unterminated shift sequence",
4293 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004294 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 goto onError;
4296 if (s < e)
4297 goto restart;
4298 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004300
4301 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004304 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004305 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306 }
4307 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004308 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004310 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 goto onError;
4314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 Py_XDECREF(errorHandler);
4316 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004317 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Benjamin Peterson29060642009-01-31 22:14:21 +00004319 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 Py_XDECREF(errorHandler);
4321 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 Py_DECREF(unicode);
4323 return NULL;
4324}
4325
4326
Alexander Belopolsky40018472011-02-26 01:02:56 +00004327PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004328_PyUnicode_EncodeUTF7(PyObject *str,
4329 int base64SetO,
4330 int base64WhiteSpace,
4331 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004333 int kind;
4334 void *data;
4335 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004336 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004337 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004339 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 unsigned int base64bits = 0;
4341 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342 char * out;
4343 char * start;
4344
Benjamin Petersonbac79492012-01-14 13:34:47 -05004345 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004346 return NULL;
4347 kind = PyUnicode_KIND(str);
4348 data = PyUnicode_DATA(str);
4349 len = PyUnicode_GET_LENGTH(str);
4350
4351 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004354 /* It might be possible to tighten this worst case */
4355 allocated = 8 * len;
4356 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004357 return PyErr_NoMemory();
4358
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360 if (v == NULL)
4361 return NULL;
4362
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004363 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004364 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004365 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 if (inShift) {
4368 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4369 /* shifting out */
4370 if (base64bits) { /* output remaining bits */
4371 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4372 base64buffer = 0;
4373 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
4375 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 /* Characters not in the BASE64 set implicitly unshift the sequence
4377 so no '-' is required, except if the character is itself a '-' */
4378 if (IS_BASE64(ch) || ch == '-') {
4379 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 *out++ = (char) ch;
4382 }
4383 else {
4384 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004385 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 else { /* not in a shift sequence */
4388 if (ch == '+') {
4389 *out++ = '+';
4390 *out++ = '-';
4391 }
4392 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4393 *out++ = (char) ch;
4394 }
4395 else {
4396 *out++ = '+';
4397 inShift = 1;
4398 goto encode_char;
4399 }
4400 }
4401 continue;
4402encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004404 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004405
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 /* code first surrogate */
4407 base64bits += 16;
4408 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4409 while (base64bits >= 6) {
4410 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4411 base64bits -= 6;
4412 }
4413 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004414 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 base64bits += 16;
4417 base64buffer = (base64buffer << 16) | ch;
4418 while (base64bits >= 6) {
4419 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4420 base64bits -= 6;
4421 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 if (base64bits)
4424 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4425 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004427 if (_PyBytes_Resize(&v, out - start) < 0)
4428 return NULL;
4429 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004431PyObject *
4432PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4433 Py_ssize_t size,
4434 int base64SetO,
4435 int base64WhiteSpace,
4436 const char *errors)
4437{
4438 PyObject *result;
4439 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4440 if (tmp == NULL)
4441 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004442 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004443 base64WhiteSpace, errors);
4444 Py_DECREF(tmp);
4445 return result;
4446}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448#undef IS_BASE64
4449#undef FROM_BASE64
4450#undef TO_BASE64
4451#undef DECODE_DIRECT
4452#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454/* --- UTF-8 Codec -------------------------------------------------------- */
4455
/* Map a UTF-8 lead (prefix) byte to the length of the sequence it starts.
   Zero marks a byte that cannot start a sequence.  See RFC 3629. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4477
Alexander Belopolsky40018472011-02-26 01:02:56 +00004478PyObject *
4479PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004480 Py_ssize_t size,
4481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482{
Walter Dörwald69652032004-09-07 20:24:22 +00004483 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4484}
4485
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004486#include "stringlib/ucs1lib.h"
4487#include "stringlib/codecs.h"
4488#include "stringlib/undef.h"
4489
4490#include "stringlib/ucs2lib.h"
4491#include "stringlib/codecs.h"
4492#include "stringlib/undef.h"
4493
4494#include "stringlib/ucs4lib.h"
4495#include "stringlib/codecs.h"
4496#include "stringlib/undef.h"
4497
Antoine Pitrouab868312009-01-10 15:40:25 +00004498/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4499#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4500
4501/* Mask to quickly check whether a C 'long' contains a
4502 non-ASCII, UTF8-encoded char. */
4503#if (SIZEOF_LONG == 8)
4504# define ASCII_CHAR_MASK 0x8080808080808080L
4505#elif (SIZEOF_LONG == 4)
4506# define ASCII_CHAR_MASK 0x80808080L
4507#else
4508# error C 'long' size should be either 4 or 8!
4509#endif
4510
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004511/* Scans a UTF-8 string and returns the maximum character to be expected
4512 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004513
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004514 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004516 */
4517static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004518utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004521 const unsigned char *end = p + string_size;
4522 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004523
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004524 assert(unicode_size != NULL);
4525
4526 /* By having a cascade of independent loops which fallback onto each
4527 other, we minimize the amount of work done in the average loop
4528 iteration, and we also maximize the CPU's ability to predict
4529 branches correctly (because a given condition will have always the
4530 same boolean outcome except perhaps in the last iteration of the
4531 corresponding loop).
4532 In the general case this brings us rather close to decoding
4533 performance pre-PEP 393, despite the two-pass decoding.
4534
4535 Note that the pure ASCII loop is not duplicated once a non-ASCII
4536 character has been encountered. It is actually a pessimization (by
4537 a significant factor) to use this loop on text with many non-ASCII
4538 characters, and it is important to avoid bad performance on valid
4539 utf-8 data (invalid utf-8 being a different can of worms).
4540 */
4541
4542 /* ASCII */
4543 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004544 /* Only check value if it's not a ASCII char... */
4545 if (*p < 0x80) {
4546 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4547 an explanation. */
4548 if (!((size_t) p & LONG_PTR_MASK)) {
4549 /* Help register allocation */
4550 register const unsigned char *_p = p;
4551 while (_p < aligned_end) {
4552 unsigned long value = *(unsigned long *) _p;
4553 if (value & ASCII_CHAR_MASK)
4554 break;
4555 _p += SIZEOF_LONG;
4556 char_count += SIZEOF_LONG;
4557 }
4558 p = _p;
4559 if (p == end)
4560 break;
4561 }
4562 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004563 if (*p < 0x80)
4564 ++char_count;
4565 else
4566 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004567 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004568 *unicode_size = char_count;
4569 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004571_ucs1loop:
4572 for (; p < end; ++p) {
4573 if (*p < 0xc4)
4574 char_count += ((*p & 0xc0) != 0x80);
4575 else
4576 goto _ucs2loop;
4577 }
4578 *unicode_size = char_count;
4579 return 255;
4580
4581_ucs2loop:
4582 for (; p < end; ++p) {
4583 if (*p < 0xf0)
4584 char_count += ((*p & 0xc0) != 0x80);
4585 else
4586 goto _ucs4loop;
4587 }
4588 *unicode_size = char_count;
4589 return 65535;
4590
4591_ucs4loop:
4592 for (; p < end; ++p) {
4593 char_count += ((*p & 0xc0) != 0x80);
4594 }
4595 *unicode_size = char_count;
4596 return 65537;
4597}
4598
/* Store `value` at position `index` of the string under construction,
   growing the string first when the index lies past its current length.

   Implicit parameters taken from the expansion site:
     - unicode: the PyObject* being filled; its address is handed to
       unicode_resize and unicode_putchar, so the pointer may be replaced.
     - onError: label jumped to when either helper reports failure (< 0).

   `index` is copied into a local and `value` appears once in the
   expansion, so each argument is evaluated exactly once and expressions
   such as `i++` are safe to pass.

   A resize requests pos + pos/8 characters, i.e. it over-allocates, so the
   result needs to be shrunk to its final length at the end.

   NOTE(review): the growth test is `pos > length`, not `pos >= length`, so
   a write at exactly pos == length is not preceded by a resize, and for
   pos < 8 the requested size equals pos.  Presumably unicode_putchar copes
   with that; confirm against its definition.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = index;                                         \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                      \
            unicode_resize(&unicode, pos + pos/8) < 0)                  \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, value) < 0)                 \
            goto onError;                                               \
    } while (0)
4612
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004613static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004614decode_utf8_errors(const char *starts,
4615 Py_ssize_t size,
4616 const char *errors,
4617 Py_ssize_t *consumed,
4618 const char *s,
4619 PyObject *unicode,
4620 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004621{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004623 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004624 Py_ssize_t startinpos;
4625 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004626 const char *e = starts + size;
4627 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004628 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 PyObject *errorHandler = NULL;
4630 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004631
Antoine Pitrouab868312009-01-10 15:40:25 +00004632 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
4634 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004635 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
4637 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004638 /* Fast path for runs of ASCII characters. Given that common UTF-8
4639 input will consist of an overwhelming majority of ASCII
4640 characters, we try to optimize for this case by checking
4641 as many characters as a C 'long' can contain.
4642 First, check if we can do an aligned read, as most CPUs have
4643 a penalty for unaligned reads.
4644 */
4645 if (!((size_t) s & LONG_PTR_MASK)) {
4646 /* Help register allocation */
4647 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004648 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004649 while (_s < aligned_end) {
4650 /* Read a whole long at a time (either 4 or 8 bytes),
4651 and do a fast unrolled copy if it only contains ASCII
4652 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653 unsigned long value = *(unsigned long *) _s;
4654 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004655 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004656 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4657 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4658 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4659 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004660#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004661 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4662 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4663 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4664 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004665#endif
4666 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004667 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004668 }
4669 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004670 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004671 if (s == e)
4672 break;
4673 ch = (unsigned char)*s;
4674 }
4675 }
4676
4677 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004678 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 s++;
4680 continue;
4681 }
4682
4683 n = utf8_code_length[ch];
4684
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004685 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 if (consumed)
4687 break;
4688 else {
4689 errmsg = "unexpected end of data";
4690 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004691 endinpos = startinpos+1;
4692 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4693 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 goto utf8Error;
4695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
4698 switch (n) {
4699
4700 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004701 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 startinpos = s-starts;
4703 endinpos = startinpos+1;
4704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
4706 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004707 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 startinpos = s-starts;
4709 endinpos = startinpos+1;
4710 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711
4712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004713 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004714 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004716 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 goto utf8Error;
4718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004720 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004721 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 break;
4723
4724 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004725 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4726 will result in surrogates in range d800-dfff. Surrogates are
4727 not valid UTF-8 so they are rejected.
4728 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4729 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004730 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004731 (s[2] & 0xc0) != 0x80 ||
4732 ((unsigned char)s[0] == 0xE0 &&
4733 (unsigned char)s[1] < 0xA0) ||
4734 ((unsigned char)s[0] == 0xED &&
4735 (unsigned char)s[1] > 0x9F)) {
4736 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004737 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004738 endinpos = startinpos + 1;
4739
4740 /* if s[1] first two bits are 1 and 0, then the invalid
4741 continuation byte is s[2], so increment endinpos by 1,
4742 if not, s[1] is invalid and endinpos doesn't need to
4743 be incremented. */
4744 if ((s[1] & 0xC0) == 0x80)
4745 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 goto utf8Error;
4747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004749 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004750 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004751 break;
4752
4753 case 4:
4754 if ((s[1] & 0xc0) != 0x80 ||
4755 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004756 (s[3] & 0xc0) != 0x80 ||
4757 ((unsigned char)s[0] == 0xF0 &&
4758 (unsigned char)s[1] < 0x90) ||
4759 ((unsigned char)s[0] == 0xF4 &&
4760 (unsigned char)s[1] > 0x8F)) {
4761 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004763 endinpos = startinpos + 1;
4764 if ((s[1] & 0xC0) == 0x80) {
4765 endinpos++;
4766 if ((s[2] & 0xC0) == 0x80)
4767 endinpos++;
4768 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004769 goto utf8Error;
4770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004773 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004774
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004775 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 }
4778 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004780
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 if (unicode_decode_call_errorhandler(
4783 errors, &errorHandler,
4784 "utf8", errmsg,
4785 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004786 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 /* Update data because unicode_decode_call_errorhandler might have
4789 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 }
Walter Dörwald69652032004-09-07 20:24:22 +00004792 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 /* Adjust length and ready string when it contained errors and
4796 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004797 if (unicode_resize(&unicode, i) < 0)
4798 goto onError;
4799 unicode_adjust_maxchar(&unicode);
4800 if (unicode == NULL)
4801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004805 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004806 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004811 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 return NULL;
4813}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004814#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004815
Victor Stinner785938e2011-12-11 20:09:03 +01004816PyObject *
4817PyUnicode_DecodeUTF8Stateful(const char *s,
4818 Py_ssize_t size,
4819 const char *errors,
4820 Py_ssize_t *consumed)
4821{
4822 Py_UCS4 maxchar = 0;
4823 Py_ssize_t unicode_size;
4824 int has_errors = 0;
4825 PyObject *unicode;
4826 int kind;
4827 void *data;
4828 const char *starts = s;
4829 const char *e;
4830 Py_ssize_t i;
4831
4832 if (size == 0) {
4833 if (consumed)
4834 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004835 Py_INCREF(unicode_empty);
4836 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004837 }
4838
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004839 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004840
4841 /* When the string is ASCII only, just use memcpy and return.
4842 unicode_size may be != size if there is an incomplete UTF-8
4843 sequence at the end of the ASCII block. */
4844 if (maxchar < 128 && size == unicode_size) {
4845 if (consumed)
4846 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004847 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004848 }
4849
4850 unicode = PyUnicode_New(unicode_size, maxchar);
4851 if (!unicode)
4852 return NULL;
4853 kind = PyUnicode_KIND(unicode);
4854 data = PyUnicode_DATA(unicode);
4855
4856 /* Unpack UTF-8 encoded data */
4857 i = 0;
4858 e = starts + size;
4859 switch (kind) {
4860 case PyUnicode_1BYTE_KIND:
4861 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4862 break;
4863 case PyUnicode_2BYTE_KIND:
4864 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4865 break;
4866 case PyUnicode_4BYTE_KIND:
4867 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4868 break;
4869 }
4870 if (!has_errors) {
4871 /* Ensure the unicode size calculation was correct */
4872 assert(i == unicode_size);
4873 assert(s == e);
4874 if (consumed)
4875 *consumed = size;
4876 return unicode;
4877 }
4878
4879 /* In case of errors, maxchar and size computation might be incorrect;
4880 code below refits and resizes as necessary. */
4881 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4882}
4883
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc, or
   NULL if the size computation would overflow or the allocation fails.
   Every byte that is not part of a valid sequence is emitted on its own as
   0xDC00 + byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer;
    wchar_t *out;
    int seqlen;

    /* One wchar_t per input byte plus the terminator is always enough:
       the input is never shorter than the decoded character count. */
    if ((size + 1) > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        /* `ch` holds the lead byte until a sequence has been validated;
           the escape path below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > end) {
            /* truncated sequence */
            goto surrogateescape;
        }

        switch (seqlen) {
        case 0:
        case 1:
            /* not a valid lead byte for a multi-byte sequence */
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* Sequences \xed\xa0\x80-\xed\xbf\xbf would decode to the
               surrogates d800-dfff, which are not valid UTF-8 and are
               therefore rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* 16-bit wchar_t: store the character as a surrogate pair */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += seqlen;
        continue;

      surrogateescape:
        /* escape exactly one byte and resume right after it */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004992/* Primary internal function which creates utf8 encoded bytes objects.
4993
4994 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004995 and allocate exactly as much space needed at the end. Else allocate the
4996 maximum possible needed (4 result bytes per Unicode character), and return
4997 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004998*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004999PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005000_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001{
Victor Stinner6099a032011-12-18 14:22:26 +01005002 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005003 void *data;
5004 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005006 if (!PyUnicode_Check(unicode)) {
5007 PyErr_BadArgument();
5008 return NULL;
5009 }
5010
5011 if (PyUnicode_READY(unicode) == -1)
5012 return NULL;
5013
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005014 if (PyUnicode_UTF8(unicode))
5015 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5016 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017
5018 kind = PyUnicode_KIND(unicode);
5019 data = PyUnicode_DATA(unicode);
5020 size = PyUnicode_GET_LENGTH(unicode);
5021
Benjamin Petersonead6b532011-12-20 17:23:42 -06005022 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005023 default:
5024 assert(0);
5025 case PyUnicode_1BYTE_KIND:
5026 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5027 assert(!PyUnicode_IS_ASCII(unicode));
5028 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5029 case PyUnicode_2BYTE_KIND:
5030 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5031 case PyUnicode_4BYTE_KIND:
5032 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034}
5035
Alexander Belopolsky40018472011-02-26 01:02:56 +00005036PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5038 Py_ssize_t size,
5039 const char *errors)
5040{
5041 PyObject *v, *unicode;
5042
5043 unicode = PyUnicode_FromUnicode(s, size);
5044 if (unicode == NULL)
5045 return NULL;
5046 v = _PyUnicode_AsUTF8String(unicode, errors);
5047 Py_DECREF(unicode);
5048 return v;
5049}
5050
5051PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005052PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005054 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055}
5056
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057/* --- UTF-32 Codec ------------------------------------------------------- */
5058
5059PyObject *
5060PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 Py_ssize_t size,
5062 const char *errors,
5063 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005064{
5065 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5066}
5067
5068PyObject *
5069PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 Py_ssize_t size,
5071 const char *errors,
5072 int *byteorder,
5073 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074{
5075 const char *starts = s;
5076 Py_ssize_t startinpos;
5077 Py_ssize_t endinpos;
5078 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005079 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005080 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081 int bo = 0; /* assume native ordering by default */
5082 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 /* Offsets from q for retrieving bytes in the right order. */
5084#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5085 int iorder[] = {0, 1, 2, 3};
5086#else
5087 int iorder[] = {3, 2, 1, 0};
5088#endif
5089 PyObject *errorHandler = NULL;
5090 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005091
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 q = (unsigned char *)s;
5093 e = q + size;
5094
5095 if (byteorder)
5096 bo = *byteorder;
5097
5098 /* Check for BOM marks (U+FEFF) in the input and adjust current
5099 byte order setting accordingly. In native mode, the leading BOM
5100 mark is skipped, in all other modes, it is copied to the output
5101 stream as-is (giving a ZWNBSP character). */
5102 if (bo == 0) {
5103 if (size >= 4) {
5104 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 if (bom == 0x0000FEFF) {
5108 q += 4;
5109 bo = -1;
5110 }
5111 else if (bom == 0xFFFE0000) {
5112 q += 4;
5113 bo = 1;
5114 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 if (bom == 0x0000FEFF) {
5117 q += 4;
5118 bo = 1;
5119 }
5120 else if (bom == 0xFFFE0000) {
5121 q += 4;
5122 bo = -1;
5123 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 }
5127
5128 if (bo == -1) {
5129 /* force LE */
5130 iorder[0] = 0;
5131 iorder[1] = 1;
5132 iorder[2] = 2;
5133 iorder[3] = 3;
5134 }
5135 else if (bo == 1) {
5136 /* force BE */
5137 iorder[0] = 3;
5138 iorder[1] = 2;
5139 iorder[2] = 1;
5140 iorder[3] = 0;
5141 }
5142
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005143 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005144 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005145 if (!unicode)
5146 return NULL;
5147 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005148 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005149 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005150
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 Py_UCS4 ch;
5153 /* remaining bytes at the end? (size should be divisible by 4) */
5154 if (e-q<4) {
5155 if (consumed)
5156 break;
5157 errmsg = "truncated data";
5158 startinpos = ((const char *)q)-starts;
5159 endinpos = ((const char *)e)-starts;
5160 goto utf32Error;
5161 /* The remaining input chars are ignored if the callback
5162 chooses to skip the input */
5163 }
5164 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5165 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 if (ch >= 0x110000)
5168 {
5169 errmsg = "codepoint not in range(0x110000)";
5170 startinpos = ((const char *)q)-starts;
5171 endinpos = startinpos+4;
5172 goto utf32Error;
5173 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005174 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5175 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 q += 4;
5177 continue;
5178 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 if (unicode_decode_call_errorhandler(
5180 errors, &errorHandler,
5181 "utf32", errmsg,
5182 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005183 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185 }
5186
5187 if (byteorder)
5188 *byteorder = bo;
5189
5190 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005192
5193 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005194 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195 goto onError;
5196
5197 Py_XDECREF(errorHandler);
5198 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005199 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005202 Py_DECREF(unicode);
5203 Py_XDECREF(errorHandler);
5204 Py_XDECREF(exc);
5205 return NULL;
5206}
5207
5208PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005209_PyUnicode_EncodeUTF32(PyObject *str,
5210 const char *errors,
5211 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005213 int kind;
5214 void *data;
5215 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005216 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005217 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005218 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219 /* Offsets from p for storing byte pairs in the right order. */
5220#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5221 int iorder[] = {0, 1, 2, 3};
5222#else
5223 int iorder[] = {3, 2, 1, 0};
5224#endif
5225
Benjamin Peterson29060642009-01-31 22:14:21 +00005226#define STORECHAR(CH) \
5227 do { \
5228 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5229 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5230 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5231 p[iorder[0]] = (CH) & 0xff; \
5232 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 } while(0)
5234
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005235 if (!PyUnicode_Check(str)) {
5236 PyErr_BadArgument();
5237 return NULL;
5238 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005239 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005240 return NULL;
5241 kind = PyUnicode_KIND(str);
5242 data = PyUnicode_DATA(str);
5243 len = PyUnicode_GET_LENGTH(str);
5244
5245 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005246 bytesize = nsize * 4;
5247 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005249 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250 if (v == NULL)
5251 return NULL;
5252
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005253 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005256 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005257 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258
5259 if (byteorder == -1) {
5260 /* force LE */
5261 iorder[0] = 0;
5262 iorder[1] = 1;
5263 iorder[2] = 2;
5264 iorder[3] = 3;
5265 }
5266 else if (byteorder == 1) {
5267 /* force BE */
5268 iorder[0] = 3;
5269 iorder[1] = 2;
5270 iorder[2] = 1;
5271 iorder[3] = 0;
5272 }
5273
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005274 for (i = 0; i < len; i++)
5275 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005276
5277 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005278 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005279#undef STORECHAR
5280}
5281
Alexander Belopolsky40018472011-02-26 01:02:56 +00005282PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005283PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5284 Py_ssize_t size,
5285 const char *errors,
5286 int byteorder)
5287{
5288 PyObject *result;
5289 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5290 if (tmp == NULL)
5291 return NULL;
5292 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5293 Py_DECREF(tmp);
5294 return result;
5295}
5296
5297PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005298PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005299{
Victor Stinnerb960b342011-11-20 19:12:52 +01005300 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005301}
5302
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303/* --- UTF-16 Codec ------------------------------------------------------- */
5304
Tim Peters772747b2001-08-09 22:21:55 +00005305PyObject *
5306PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 Py_ssize_t size,
5308 const char *errors,
5309 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310{
Walter Dörwald69652032004-09-07 20:24:22 +00005311 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5312}
5313
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   FAST_CHAR_MASK selects bit 15 of every 16-bit unit in the long;
   SWAPPED_FAST_CHAR_MASK selects bit 7 of every 16-bit unit, which is
   where that same bit sits when the two bytes of each unit are swapped.
   Only 4-byte and 8-byte 'long' are supported; any other size is a
   compile-time error.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
5330
Walter Dörwald69652032004-09-07 20:24:22 +00005331PyObject *
5332PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 Py_ssize_t size,
5334 const char *errors,
5335 int *byteorder,
5336 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 Py_ssize_t startinpos;
5340 Py_ssize_t endinpos;
5341 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005342 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005344 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005345 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005346 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005347 /* Offsets from q for retrieving byte pairs in the right order. */
5348#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5349 int ihi = 1, ilo = 0;
5350#else
5351 int ihi = 0, ilo = 1;
5352#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 PyObject *errorHandler = NULL;
5354 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
5356 /* Note: size will always be longer than the resulting Unicode
5357 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005358 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 if (!unicode)
5360 return NULL;
5361 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005362 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005363 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
Tim Peters772747b2001-08-09 22:21:55 +00005365 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005366 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367
5368 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005369 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005371 /* Check for BOM marks (U+FEFF) in the input and adjust current
5372 byte order setting accordingly. In native mode, the leading BOM
5373 mark is skipped, in all other modes, it is copied to the output
5374 stream as-is (giving a ZWNBSP character). */
5375 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005376 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005377 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005378#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 if (bom == 0xFEFF) {
5380 q += 2;
5381 bo = -1;
5382 }
5383 else if (bom == 0xFFFE) {
5384 q += 2;
5385 bo = 1;
5386 }
Tim Petersced69f82003-09-16 20:30:58 +00005387#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 if (bom == 0xFEFF) {
5389 q += 2;
5390 bo = 1;
5391 }
5392 else if (bom == 0xFFFE) {
5393 q += 2;
5394 bo = -1;
5395 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005396#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
Tim Peters772747b2001-08-09 22:21:55 +00005400 if (bo == -1) {
5401 /* force LE */
5402 ihi = 1;
5403 ilo = 0;
5404 }
5405 else if (bo == 1) {
5406 /* force BE */
5407 ihi = 0;
5408 ilo = 1;
5409 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005410#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5411 native_ordering = ilo < ihi;
5412#else
5413 native_ordering = ilo > ihi;
5414#endif
Tim Peters772747b2001-08-09 22:21:55 +00005415
Antoine Pitrouab868312009-01-10 15:40:25 +00005416 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005417 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005418 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005419 /* First check for possible aligned read of a C 'long'. Unaligned
5420 reads are more expensive, better to defer to another iteration. */
5421 if (!((size_t) q & LONG_PTR_MASK)) {
5422 /* Fast path for runs of non-surrogate chars. */
5423 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005424 int kind = PyUnicode_KIND(unicode);
5425 void *data = PyUnicode_DATA(unicode);
5426 while (_q < aligned_end) {
5427 unsigned long block = * (unsigned long *) _q;
5428 unsigned short *pblock = (unsigned short*)&block;
5429 Py_UCS4 maxch;
5430 if (native_ordering) {
5431 /* Can use buffer directly */
5432 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005433 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005434 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005435 else {
5436 /* Need to byte-swap */
5437 unsigned char *_p = (unsigned char*)pblock;
5438 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005439 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005440 _p[0] = _q[1];
5441 _p[1] = _q[0];
5442 _p[2] = _q[3];
5443 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005444#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005445 _p[4] = _q[5];
5446 _p[5] = _q[4];
5447 _p[6] = _q[7];
5448 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005449#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005450 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005451 maxch = Py_MAX(pblock[0], pblock[1]);
5452#if SIZEOF_LONG == 8
5453 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5454#endif
5455 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5456 if (unicode_widen(&unicode, maxch) < 0)
5457 goto onError;
5458 kind = PyUnicode_KIND(unicode);
5459 data = PyUnicode_DATA(unicode);
5460 }
5461 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5462 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5463#if SIZEOF_LONG == 8
5464 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5465 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5466#endif
5467 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005468 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005469 q = _q;
5470 if (q >= e)
5471 break;
5472 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474
Benjamin Peterson14339b62009-01-31 16:36:08 +00005475 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005476
Victor Stinner551ac952011-11-29 22:58:13 +01005477 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005478 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5479 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 continue;
5481 }
5482
5483 /* UTF-16 code pair: */
5484 if (q > e) {
5485 errmsg = "unexpected end of data";
5486 startinpos = (((const char *)q) - 2) - starts;
5487 endinpos = ((const char *)e) + 1 - starts;
5488 goto utf16Error;
5489 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005490 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5491 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005493 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005494 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005495 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005496 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 continue;
5498 }
5499 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005500 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 startinpos = (((const char *)q)-4)-starts;
5502 endinpos = startinpos+2;
5503 goto utf16Error;
5504 }
5505
Benjamin Peterson14339b62009-01-31 16:36:08 +00005506 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 errmsg = "illegal encoding";
5508 startinpos = (((const char *)q)-2)-starts;
5509 endinpos = startinpos+2;
5510 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005511
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005514 errors,
5515 &errorHandler,
5516 "utf16", errmsg,
5517 &starts,
5518 (const char **)&e,
5519 &startinpos,
5520 &endinpos,
5521 &exc,
5522 (const char **)&q,
5523 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005527 /* remaining byte at the end? (size should be even) */
5528 if (e == q) {
5529 if (!consumed) {
5530 errmsg = "truncated data";
5531 startinpos = ((const char *)q) - starts;
5532 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005533 if (unicode_decode_call_errorhandler(
5534 errors,
5535 &errorHandler,
5536 "utf16", errmsg,
5537 &starts,
5538 (const char **)&e,
5539 &startinpos,
5540 &endinpos,
5541 &exc,
5542 (const char **)&q,
5543 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005544 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005545 goto onError;
5546 /* The remaining input chars are ignored if the callback
5547 chooses to skip the input */
5548 }
5549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550
5551 if (byteorder)
5552 *byteorder = bo;
5553
Walter Dörwald69652032004-09-07 20:24:22 +00005554 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005556
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005558 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 goto onError;
5560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 Py_XDECREF(errorHandler);
5562 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005563 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 Py_XDECREF(errorHandler);
5568 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 return NULL;
5570}
5571
/* The surrogate-detection masks are only needed by the UTF-16 decoder
   above; drop them so they do not leak into the rest of the file. */
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
5574
Tim Peters772747b2001-08-09 22:21:55 +00005575PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005576_PyUnicode_EncodeUTF16(PyObject *str,
5577 const char *errors,
5578 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005580 int kind;
5581 void *data;
5582 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005583 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005584 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005585 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005586 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005587 /* Offsets from p for storing byte pairs in the right order. */
5588#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5589 int ihi = 1, ilo = 0;
5590#else
5591 int ihi = 0, ilo = 1;
5592#endif
5593
Benjamin Peterson29060642009-01-31 22:14:21 +00005594#define STORECHAR(CH) \
5595 do { \
5596 p[ihi] = ((CH) >> 8) & 0xff; \
5597 p[ilo] = (CH) & 0xff; \
5598 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005599 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005601 if (!PyUnicode_Check(str)) {
5602 PyErr_BadArgument();
5603 return NULL;
5604 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005606 return NULL;
5607 kind = PyUnicode_KIND(str);
5608 data = PyUnicode_DATA(str);
5609 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005610
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005611 pairs = 0;
5612 if (kind == PyUnicode_4BYTE_KIND)
5613 for (i = 0; i < len; i++)
5614 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5615 pairs++;
5616 /* 2 * (len + pairs + (byteorder == 0)) */
5617 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005620 bytesize = nsize * 2;
5621 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005623 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 if (v == NULL)
5625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005627 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005630 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005631 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005632
5633 if (byteorder == -1) {
5634 /* force LE */
5635 ihi = 1;
5636 ilo = 0;
5637 }
5638 else if (byteorder == 1) {
5639 /* force BE */
5640 ihi = 0;
5641 ilo = 1;
5642 }
5643
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005644 for (i = 0; i < len; i++) {
5645 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5646 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005648 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5649 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 }
Tim Peters772747b2001-08-09 22:21:55 +00005651 STORECHAR(ch);
5652 if (ch2)
5653 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005654 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005655
5656 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005657 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005658#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659}
5660
Alexander Belopolsky40018472011-02-26 01:02:56 +00005661PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005662PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5663 Py_ssize_t size,
5664 const char *errors,
5665 int byteorder)
5666{
5667 PyObject *result;
5668 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5669 if (tmp == NULL)
5670 return NULL;
5671 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5672 Py_DECREF(tmp);
5673 return result;
5674}
5675
5676PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005677PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005679 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680}
5681
5682/* --- Unicode Escape Codec ----------------------------------------------- */
5683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5685 if all the escapes in the string make it still a valid ASCII string.
5686 Returns -1 if any escapes were found which cause the string to
5687 pop out of ASCII range. Otherwise returns the length of the
5688 required buffer to hold the string.
5689 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005690static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5692{
5693 const unsigned char *p = (const unsigned char *)s;
5694 const unsigned char *end = p + size;
5695 Py_ssize_t length = 0;
5696
5697 if (size < 0)
5698 return -1;
5699
5700 for (; p < end; ++p) {
5701 if (*p > 127) {
5702 /* Non-ASCII */
5703 return -1;
5704 }
5705 else if (*p != '\\') {
5706 /* Normal character */
5707 ++length;
5708 }
5709 else {
5710 /* Backslash-escape, check next char */
5711 ++p;
5712 /* Escape sequence reaches till end of string or
5713 non-ASCII follow-up. */
5714 if (p >= end || *p > 127)
5715 return -1;
5716 switch (*p) {
5717 case '\n':
5718 /* backslash + \n result in zero characters */
5719 break;
5720 case '\\': case '\'': case '\"':
5721 case 'b': case 'f': case 't':
5722 case 'n': case 'r': case 'v': case 'a':
5723 ++length;
5724 break;
5725 case '0': case '1': case '2': case '3':
5726 case '4': case '5': case '6': case '7':
5727 case 'x': case 'u': case 'U': case 'N':
5728 /* these do not guarantee ASCII characters */
5729 return -1;
5730 default:
5731 /* count the backslash + the other character */
5732 length += 2;
5733 }
5734 }
5735 }
5736 return length;
5737}
5738
/* File-local pointer to the Unicode name C API table; starts out NULL.
   NOTE(review): presumably filled in lazily the first time a \N{...}
   escape has to be resolved -- confirm against the decoder below. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005740
Alexander Belopolsky40018472011-02-26 01:02:56 +00005741PyObject *
5742PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005743 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747 Py_ssize_t startinpos;
5748 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005750 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005752 char* message;
5753 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 PyObject *errorHandler = NULL;
5755 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005756 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760
5761 /* After length_of_escaped_ascii_string() there are two alternatives,
5762 either the string is pure ASCII with named escapes like \n, etc.
5763 and we determined it's exact size (common case)
5764 or it contains \x, \u, ... escape sequences. then we create a
5765 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 if (len >= 0) {
5767 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 if (!v)
5769 goto onError;
5770 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005771 }
5772 else {
5773 /* Escaped strings will always be longer than the resulting
5774 Unicode string, so we start with size here and then reduce the
5775 length after conversion to the true value.
5776 (but if the error callback returns a long replacement string
5777 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005778 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779 if (!v)
5780 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005782 }
5783
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005785 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 while (s < end) {
5790 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005791 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005794 /* The only case in which i == ascii_length is a backslash
5795 followed by a newline. */
5796 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 /* Non-escape characters are interpreted as Unicode ordinals */
5799 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005800 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 continue;
5803 }
5804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 /* \ - Escapes */
5807 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005808 c = *s++;
5809 if (s > end)
5810 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005811
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005812 /* The only case in which i == ascii_length is a backslash
5813 followed by a newline. */
5814 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005815
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005816 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005819#define WRITECHAR(ch) \
5820 do { \
5821 if (unicode_putchar(&v, &i, ch) < 0) \
5822 goto onError; \
5823 }while(0)
5824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005826 case '\\': WRITECHAR('\\'); break;
5827 case '\'': WRITECHAR('\''); break;
5828 case '\"': WRITECHAR('\"'); break;
5829 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831 case 'f': WRITECHAR('\014'); break;
5832 case 't': WRITECHAR('\t'); break;
5833 case 'n': WRITECHAR('\n'); break;
5834 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005836 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005838 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 case '0': case '1': case '2': case '3':
5842 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005843 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005844 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005845 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005846 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005847 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005849 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 break;
5851
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 /* hex escapes */
5853 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005855 digits = 2;
5856 message = "truncated \\xXX escape";
5857 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005861 digits = 4;
5862 message = "truncated \\uXXXX escape";
5863 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005866 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005867 digits = 8;
5868 message = "truncated \\UXXXXXXXX escape";
5869 hexescape:
5870 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 if (s+digits>end) {
5872 endinpos = size;
5873 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 errors, &errorHandler,
5875 "unicodeescape", "end of string in escape sequence",
5876 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005877 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 goto onError;
5879 goto nextByte;
5880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881 for (j = 0; j < digits; ++j) {
5882 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005883 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 errors, &errorHandler,
5887 "unicodeescape", message,
5888 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005890 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005891 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005893 }
5894 chr = (chr<<4) & ~0xF;
5895 if (c >= '0' && c <= '9')
5896 chr += c - '0';
5897 else if (c >= 'a' && c <= 'f')
5898 chr += 10 + c - 'a';
5899 else
5900 chr += 10 + c - 'A';
5901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005902 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005903 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 /* _decoding_error will have already written into the
5905 target buffer. */
5906 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005907 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005908 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005909 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005910 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005911 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 errors, &errorHandler,
5915 "unicodeescape", "illegal Unicode character",
5916 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005917 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005918 goto onError;
5919 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920 break;
5921
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005923 case 'N':
5924 message = "malformed \\N character escape";
5925 if (ucnhash_CAPI == NULL) {
5926 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005927 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5928 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005929 if (ucnhash_CAPI == NULL)
5930 goto ucnhashError;
5931 }
5932 if (*s == '{') {
5933 const char *start = s+1;
5934 /* look for the closing brace */
5935 while (*s != '}' && s < end)
5936 s++;
5937 if (s > start && s < end && *s == '}') {
5938 /* found a name. look it up in the unicode database */
5939 message = "unknown Unicode character name";
5940 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005942 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005943 goto store;
5944 }
5945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 errors, &errorHandler,
5949 "unicodeescape", message,
5950 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005951 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005952 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005953 break;
5954
5955 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005956 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957 message = "\\ at end of string";
5958 s--;
5959 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 errors, &errorHandler,
5962 "unicodeescape", message,
5963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005965 goto onError;
5966 }
5967 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 WRITECHAR('\\');
5969 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005970 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005976#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977
Victor Stinner16e6a802011-12-12 13:24:15 +01005978 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005980 Py_XDECREF(errorHandler);
5981 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005982 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005983
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005985 PyErr_SetString(
5986 PyExc_UnicodeError,
5987 "\\N escapes not supported (can't load unicodedata module)"
5988 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005989 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990 Py_XDECREF(errorHandler);
5991 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005992 return NULL;
5993
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 Py_XDECREF(errorHandler);
5997 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 return NULL;
5999}
6000
6001/* Return a Unicode-Escape string version of the Unicode object.
6002
6003 If quotes is true, the string is enclosed in u"" or u'' quotes as
6004 appropriate.
6005
6006*/
6007
Alexander Belopolsky40018472011-02-26 01:02:56 +00006008PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006012 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006014 int kind;
6015 void *data;
6016 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Thomas Wouters89f507f2006-12-13 04:49:30 +00006018 /* Initial allocation is based on the longest-possible unichr
6019 escape.
6020
6021 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6022 unichr, so in this case it's the longest unichr escape. In
6023 narrow (UTF-16) builds this is five chars per source unichr
6024 since there are two unichrs in the surrogate pair, so in narrow
6025 (UTF-16) builds it's not the longest unichr escape.
6026
6027 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6028 so in the narrow (UTF-16) build case it's the longest unichr
6029 escape.
6030 */
6031
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006032 if (!PyUnicode_Check(unicode)) {
6033 PyErr_BadArgument();
6034 return NULL;
6035 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006036 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037 return NULL;
6038 len = PyUnicode_GET_LENGTH(unicode);
6039 kind = PyUnicode_KIND(unicode);
6040 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006041 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6043 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6044 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6045 }
6046
6047 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006048 return PyBytes_FromStringAndSize(NULL, 0);
6049
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006052
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006053 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006055 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 if (repr == NULL)
6058 return NULL;
6059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006060 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006062 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006063 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006064
Walter Dörwald79e913e2007-05-12 11:08:06 +00006065 /* Escape backslashes */
6066 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 *p++ = '\\';
6068 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006069 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006071
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006072 /* Map 21-bit characters to '\U00xxxxxx' */
6073 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006074 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006075 *p++ = '\\';
6076 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006077 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6078 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6079 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6080 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6081 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6082 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6083 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6084 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006086 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006089 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 *p++ = '\\';
6091 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006092 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6093 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6094 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6095 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006098 /* Map special whitespace to '\t', \n', '\r' */
6099 else if (ch == '\t') {
6100 *p++ = '\\';
6101 *p++ = 't';
6102 }
6103 else if (ch == '\n') {
6104 *p++ = '\\';
6105 *p++ = 'n';
6106 }
6107 else if (ch == '\r') {
6108 *p++ = '\\';
6109 *p++ = 'r';
6110 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006111
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006112 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006113 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006115 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006116 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6117 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 /* Copy everything else as-is */
6121 else
6122 *p++ = (char) ch;
6123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006125 assert(p - PyBytes_AS_STRING(repr) > 0);
6126 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6127 return NULL;
6128 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129}
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6133 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006135 PyObject *result;
6136 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6137 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139 result = PyUnicode_AsUnicodeEscapeString(tmp);
6140 Py_DECREF(tmp);
6141 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142}
6143
6144/* --- Raw Unicode Escape Codec ------------------------------------------- */
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
6147PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006148 Py_ssize_t size,
6149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006152 Py_ssize_t startinpos;
6153 Py_ssize_t endinpos;
6154 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006155 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 const char *end;
6157 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 PyObject *errorHandler = NULL;
6159 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 /* Escaped strings will always be longer than the resulting
6162 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 length after conversion to the true value. (But decoding error
6164 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006165 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006169 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006170 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 end = s + size;
6172 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 unsigned char c;
6174 Py_UCS4 x;
6175 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006176 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 /* Non-escape characters are interpreted as Unicode ordinals */
6179 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006180 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6181 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006183 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 startinpos = s-starts;
6185
6186 /* \u-escapes are only interpreted iff the number of leading
6187 backslashes if odd */
6188 bs = s;
6189 for (;s < end;) {
6190 if (*s != '\\')
6191 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006192 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6193 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 }
6195 if (((s - bs) & 1) == 0 ||
6196 s >= end ||
6197 (*s != 'u' && *s != 'U')) {
6198 continue;
6199 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006200 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 count = *s=='u' ? 4 : 8;
6202 s++;
6203
6204 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 for (x = 0, i = 0; i < count; ++i, ++s) {
6206 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006207 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 endinpos = s-starts;
6209 if (unicode_decode_call_errorhandler(
6210 errors, &errorHandler,
6211 "rawunicodeescape", "truncated \\uXXXX",
6212 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006213 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 goto onError;
6215 goto nextByte;
6216 }
6217 x = (x<<4) & ~0xF;
6218 if (c >= '0' && c <= '9')
6219 x += c - '0';
6220 else if (c >= 'a' && c <= 'f')
6221 x += 10 + c - 'a';
6222 else
6223 x += 10 + c - 'A';
6224 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006225 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 if (unicode_putchar(&v, &outpos, x) < 0)
6227 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006228 } else {
6229 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006230 if (unicode_decode_call_errorhandler(
6231 errors, &errorHandler,
6232 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006234 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006236 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 nextByte:
6238 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006240 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006242 Py_XDECREF(errorHandler);
6243 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006244 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 Py_XDECREF(errorHandler);
6249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 return NULL;
6251}
6252
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006253
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006255PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006257 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 char *p;
6259 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006260 Py_ssize_t expandsize, pos;
6261 int kind;
6262 void *data;
6263 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265 if (!PyUnicode_Check(unicode)) {
6266 PyErr_BadArgument();
6267 return NULL;
6268 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006269 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006270 return NULL;
6271 kind = PyUnicode_KIND(unicode);
6272 data = PyUnicode_DATA(unicode);
6273 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006274 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6275 bytes, and 1 byte characters 4. */
6276 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006280
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 if (repr == NULL)
6283 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006284 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006285 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006287 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006288 for (pos = 0; pos < len; pos++) {
6289 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* Map 32-bit characters to '\Uxxxxxxxx' */
6291 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006292 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006293 *p++ = '\\';
6294 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006295 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6296 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6297 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6298 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6299 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6300 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6301 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6302 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006305 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 *p++ = '\\';
6307 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006308 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6309 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6310 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6311 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 /* Copy everything else as-is */
6314 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 *p++ = (char) ch;
6316 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006317
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 assert(p > q);
6319 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006320 return NULL;
6321 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Alexander Belopolsky40018472011-02-26 01:02:56 +00006324PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6326 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006328 PyObject *result;
6329 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6330 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006331 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6333 Py_DECREF(tmp);
6334 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006337/* --- Unicode Internal Codec ------------------------------------------- */
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
6340_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006341 Py_ssize_t size,
6342 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343{
6344 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006345 Py_ssize_t startinpos;
6346 Py_ssize_t endinpos;
6347 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006348 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006349 const char *end;
6350 const char *reason;
6351 PyObject *errorHandler = NULL;
6352 PyObject *exc = NULL;
6353
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006354 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006355 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006356 1))
6357 return NULL;
6358
Thomas Wouters89f507f2006-12-13 04:49:30 +00006359 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006360 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006361 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006363 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006364 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006365 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006366 end = s + size;
6367
6368 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006369 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006370 Py_UCS4 ch;
6371 /* We copy the raw representation one byte at a time because the
6372 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006373 ((char *) &uch)[0] = s[0];
6374 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006375#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006376 ((char *) &uch)[2] = s[2];
6377 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006378#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006379 ch = uch;
6380
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006381 /* We have to sanity check the raw data, otherwise doom looms for
6382 some malformed UCS-4 data. */
6383 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006384#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006385 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006386#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006387 end-s < Py_UNICODE_SIZE
6388 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006390 startinpos = s - starts;
6391 if (end-s < Py_UNICODE_SIZE) {
6392 endinpos = end-starts;
6393 reason = "truncated input";
6394 }
6395 else {
6396 endinpos = s - starts + Py_UNICODE_SIZE;
6397 reason = "illegal code point (> 0x10FFFF)";
6398 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399 if (unicode_decode_call_errorhandler(
6400 errors, &errorHandler,
6401 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006402 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006403 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006404 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006405 continue;
6406 }
6407
6408 s += Py_UNICODE_SIZE;
6409#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006410 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006411 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006412 Py_UNICODE uch2;
6413 ((char *) &uch2)[0] = s[0];
6414 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006415 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006416 {
Victor Stinner551ac952011-11-29 22:58:13 +01006417 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006419 }
6420 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006421#endif
6422
6423 if (unicode_putchar(&v, &outpos, ch) < 0)
6424 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006425 }
6426
Victor Stinner16e6a802011-12-12 13:24:15 +01006427 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006428 goto onError;
6429 Py_XDECREF(errorHandler);
6430 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006431 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006432
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006434 Py_XDECREF(v);
6435 Py_XDECREF(errorHandler);
6436 Py_XDECREF(exc);
6437 return NULL;
6438}
6439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440/* --- Latin-1 Codec ------------------------------------------------------ */
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442PyObject *
6443PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006444 Py_ssize_t size,
6445 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006448 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449}
6450
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006452static void
6453make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006454 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006455 PyObject *unicode,
6456 Py_ssize_t startpos, Py_ssize_t endpos,
6457 const char *reason)
6458{
6459 if (*exceptionObject == NULL) {
6460 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006462 encoding, unicode, startpos, endpos, reason);
6463 }
6464 else {
6465 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6466 goto onError;
6467 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6468 goto onError;
6469 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6470 goto onError;
6471 return;
6472 onError:
6473 Py_DECREF(*exceptionObject);
6474 *exceptionObject = NULL;
6475 }
6476}
6477
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006479static void
6480raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006481 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006482 PyObject *unicode,
6483 Py_ssize_t startpos, Py_ssize_t endpos,
6484 const char *reason)
6485{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006486 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006487 encoding, unicode, startpos, endpos, reason);
6488 if (*exceptionObject != NULL)
6489 PyCodec_StrictErrors(*exceptionObject);
6490}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491
6492/* error handling callback helper:
6493 build arguments, call the callback and check the arguments,
6494 put the result into newpos and return the replacement string, which
6495 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006496static PyObject *
6497unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006498 PyObject **errorHandler,
6499 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006501 Py_ssize_t startpos, Py_ssize_t endpos,
6502 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006503{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006504 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506 PyObject *restuple;
6507 PyObject *resunicode;
6508
6509 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 }
6514
Benjamin Petersonbac79492012-01-14 13:34:47 -05006515 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 return NULL;
6517 len = PyUnicode_GET_LENGTH(unicode);
6518
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006519 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006521 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523
6524 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006529 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 Py_DECREF(restuple);
6531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006533 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 &resunicode, newpos)) {
6535 Py_DECREF(restuple);
6536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006538 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6539 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6540 Py_DECREF(restuple);
6541 return NULL;
6542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 *newpos = len + *newpos;
6545 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6547 Py_DECREF(restuple);
6548 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006549 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_INCREF(resunicode);
6551 Py_DECREF(restuple);
6552 return resunicode;
6553}
6554
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006557 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006558 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 /* input state */
6561 Py_ssize_t pos=0, size;
6562 int kind;
6563 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 /* output object */
6565 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 /* pointer into the output */
6567 char *str;
6568 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006570 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6571 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 PyObject *errorHandler = NULL;
6573 PyObject *exc = NULL;
6574 /* the following variable is used for caching string comparisons
6575 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6576 int known_errorHandler = -1;
6577
Benjamin Petersonbac79492012-01-14 13:34:47 -05006578 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006579 return NULL;
6580 size = PyUnicode_GET_LENGTH(unicode);
6581 kind = PyUnicode_KIND(unicode);
6582 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583 /* allocate enough for a simple encoding without
6584 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006585 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006586 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006587 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006589 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006590 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591 ressize = size;
6592
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 while (pos < size) {
6594 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 /* can we encode this? */
6597 if (c<limit) {
6598 /* no overflow check, because we know that the space is enough */
6599 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006600 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 Py_ssize_t requiredsize;
6604 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 Py_ssize_t collstart = pos;
6608 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 ++collend;
6612 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6613 if (known_errorHandler==-1) {
6614 if ((errors==NULL) || (!strcmp(errors, "strict")))
6615 known_errorHandler = 1;
6616 else if (!strcmp(errors, "replace"))
6617 known_errorHandler = 2;
6618 else if (!strcmp(errors, "ignore"))
6619 known_errorHandler = 3;
6620 else if (!strcmp(errors, "xmlcharrefreplace"))
6621 known_errorHandler = 4;
6622 else
6623 known_errorHandler = 0;
6624 }
6625 switch (known_errorHandler) {
6626 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006627 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 goto onError;
6629 case 2: /* replace */
6630 while (collstart++<collend)
6631 *str++ = '?'; /* fall through */
6632 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 break;
6635 case 4: /* xmlcharrefreplace */
6636 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637 /* determine replacement size */
6638 for (i = collstart, repsize = 0; i < collend; ++i) {
6639 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6640 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006652 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006653 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006655 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 if (requiredsize > ressize) {
6659 if (requiredsize<2*ressize)
6660 requiredsize = 2*ressize;
6661 if (_PyBytes_Resize(&res, requiredsize))
6662 goto onError;
6663 str = PyBytes_AS_STRING(res) + respos;
6664 ressize = requiredsize;
6665 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 /* generate replacement */
6667 for (i = collstart; i < collend; ++i) {
6668 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 break;
6672 default:
6673 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 encoding, reason, unicode, &exc,
6675 collstart, collend, &newpos);
6676 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006677 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006679 if (PyBytes_Check(repunicode)) {
6680 /* Directly copy bytes result to output. */
6681 repsize = PyBytes_Size(repunicode);
6682 if (repsize > 1) {
6683 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006684 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006685 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6686 Py_DECREF(repunicode);
6687 goto onError;
6688 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006689 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006690 ressize += repsize-1;
6691 }
6692 memcpy(str, PyBytes_AsString(repunicode), repsize);
6693 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006694 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006696 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 /* need more space? (at least enough for what we
6699 have+the replacement+the rest of the string, so
6700 we won't have to check space for encodable characters) */
6701 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 repsize = PyUnicode_GET_LENGTH(repunicode);
6703 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 if (requiredsize > ressize) {
6705 if (requiredsize<2*ressize)
6706 requiredsize = 2*ressize;
6707 if (_PyBytes_Resize(&res, requiredsize)) {
6708 Py_DECREF(repunicode);
6709 goto onError;
6710 }
6711 str = PyBytes_AS_STRING(res) + respos;
6712 ressize = requiredsize;
6713 }
6714 /* check if there is anything unencodable in the replacement
6715 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 for (i = 0; repsize-->0; ++i, ++str) {
6717 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006719 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 Py_DECREF(repunicode);
6722 goto onError;
6723 }
6724 *str = (char)c;
6725 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006727 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006729 }
6730 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006731 /* Resize if we allocated to much */
6732 size = str - PyBytes_AS_STRING(res);
6733 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006734 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006735 if (_PyBytes_Resize(&res, size) < 0)
6736 goto onError;
6737 }
6738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 Py_XDECREF(errorHandler);
6740 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006741 return res;
6742
6743 onError:
6744 Py_XDECREF(res);
6745 Py_XDECREF(errorHandler);
6746 Py_XDECREF(exc);
6747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748}
6749
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751PyObject *
6752PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006753 Py_ssize_t size,
6754 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 PyObject *result;
6757 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6758 if (unicode == NULL)
6759 return NULL;
6760 result = unicode_encode_ucs1(unicode, errors, 256);
6761 Py_DECREF(unicode);
6762 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763}
6764
Alexander Belopolsky40018472011-02-26 01:02:56 +00006765PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
6768 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 PyErr_BadArgument();
6770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006772 if (PyUnicode_READY(unicode) == -1)
6773 return NULL;
6774 /* Fast path: if it is a one-byte string, construct
6775 bytes object directly. */
6776 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6777 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6778 PyUnicode_GET_LENGTH(unicode));
6779 /* Non-Latin-1 characters present. Defer to above function to
6780 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006782}
6783
6784PyObject*
6785PyUnicode_AsLatin1String(PyObject *unicode)
6786{
6787 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
6790/* --- 7-bit ASCII Codec -------------------------------------------------- */
6791
Alexander Belopolsky40018472011-02-26 01:02:56 +00006792PyObject *
6793PyUnicode_DecodeASCII(const char *s,
6794 Py_ssize_t size,
6795 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006797 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006798 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006799 int kind;
6800 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006801 Py_ssize_t startinpos;
6802 Py_ssize_t endinpos;
6803 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006805 int has_error;
6806 const unsigned char *p = (const unsigned char *)s;
6807 const unsigned char *end = p + size;
6808 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 PyObject *errorHandler = NULL;
6810 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006811
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006812 if (size == 0) {
6813 Py_INCREF(unicode_empty);
6814 return unicode_empty;
6815 }
6816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006818 if (size == 1 && (unsigned char)s[0] < 128)
6819 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006820
Victor Stinner702c7342011-10-05 13:50:52 +02006821 has_error = 0;
6822 while (p < end && !has_error) {
6823 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6824 an explanation. */
6825 if (!((size_t) p & LONG_PTR_MASK)) {
6826 /* Help register allocation */
6827 register const unsigned char *_p = p;
6828 while (_p < aligned_end) {
6829 unsigned long value = *(unsigned long *) _p;
6830 if (value & ASCII_CHAR_MASK) {
6831 has_error = 1;
6832 break;
6833 }
6834 _p += SIZEOF_LONG;
6835 }
6836 if (_p == end)
6837 break;
6838 if (has_error)
6839 break;
6840 p = _p;
6841 }
6842 if (*p & 0x80) {
6843 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006844 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006845 }
6846 else {
6847 ++p;
6848 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006849 }
Victor Stinner702c7342011-10-05 13:50:52 +02006850 if (!has_error)
6851 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006853 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006857 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006858 kind = PyUnicode_KIND(v);
6859 data = PyUnicode_DATA(v);
6860 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 e = s + size;
6862 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 register unsigned char c = (unsigned char)*s;
6864 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006865 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 ++s;
6867 }
6868 else {
6869 startinpos = s-starts;
6870 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 if (unicode_decode_call_errorhandler(
6872 errors, &errorHandler,
6873 "ascii", "ordinal not in range(128)",
6874 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006875 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006877 kind = PyUnicode_KIND(v);
6878 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006881 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883 Py_XDECREF(errorHandler);
6884 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006885 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006886 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006887
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 Py_XDECREF(errorHandler);
6891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 return NULL;
6893}
6894
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006895/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006896PyObject *
6897PyUnicode_EncodeASCII(const Py_UNICODE *p,
6898 Py_ssize_t size,
6899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901 PyObject *result;
6902 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6903 if (unicode == NULL)
6904 return NULL;
6905 result = unicode_encode_ucs1(unicode, errors, 128);
6906 Py_DECREF(unicode);
6907 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006911_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
6913 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 PyErr_BadArgument();
6915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006917 if (PyUnicode_READY(unicode) == -1)
6918 return NULL;
6919 /* Fast path: if it is an ASCII-only string, construct bytes object
6920 directly. Else defer to above function to raise the exception. */
6921 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6922 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6923 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006925}
6926
6927PyObject *
6928PyUnicode_AsASCIIString(PyObject *unicode)
6929{
6930 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
Victor Stinner99b95382011-07-04 14:23:54 +02006933#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006934
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006935/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006936
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006937#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938#define NEED_RETRY
6939#endif
6940
Victor Stinner3a50e702011-10-18 21:21:00 +02006941#ifndef WC_ERR_INVALID_CHARS
6942# define WC_ERR_INVALID_CHARS 0x0080
6943#endif
6944
6945static char*
6946code_page_name(UINT code_page, PyObject **obj)
6947{
6948 *obj = NULL;
6949 if (code_page == CP_ACP)
6950 return "mbcs";
6951 if (code_page == CP_UTF7)
6952 return "CP_UTF7";
6953 if (code_page == CP_UTF8)
6954 return "CP_UTF8";
6955
6956 *obj = PyBytes_FromFormat("cp%u", code_page);
6957 if (*obj == NULL)
6958 return NULL;
6959 return PyBytes_AS_STRING(*obj);
6960}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961
Alexander Belopolsky40018472011-02-26 01:02:56 +00006962static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006963is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964{
6965 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006966 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006967
Victor Stinner3a50e702011-10-18 21:21:00 +02006968 if (!IsDBCSLeadByteEx(code_page, *curr))
6969 return 0;
6970
6971 prev = CharPrevExA(code_page, s, curr, 0);
6972 if (prev == curr)
6973 return 1;
6974 /* FIXME: This code is limited to "true" double-byte encodings,
6975 as it assumes an incomplete character consists of a single
6976 byte. */
6977 if (curr - prev == 2)
6978 return 1;
6979 if (!IsDBCSLeadByteEx(code_page, *prev))
6980 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981 return 0;
6982}
6983
Victor Stinner3a50e702011-10-18 21:21:00 +02006984static DWORD
6985decode_code_page_flags(UINT code_page)
6986{
6987 if (code_page == CP_UTF7) {
6988 /* The CP_UTF7 decoder only supports flags=0 */
6989 return 0;
6990 }
6991 else
6992 return MB_ERR_INVALID_CHARS;
6993}
6994
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 * Decode a byte string from a Windows code page into unicode object in strict
6997 * mode.
6998 *
6999 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7000 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007002static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007003decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007004 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 const char *in,
7006 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007{
Victor Stinner3a50e702011-10-18 21:21:00 +02007008 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007009 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007010 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011
7012 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 assert(insize > 0);
7014 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7015 if (outsize <= 0)
7016 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017
7018 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007020 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007021 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 if (*v == NULL)
7023 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007025 }
7026 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007029 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007031 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032 }
7033
7034 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007035 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7036 if (outsize <= 0)
7037 goto error;
7038 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007039
Victor Stinner3a50e702011-10-18 21:21:00 +02007040error:
7041 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7042 return -2;
7043 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007044 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045}
7046
Victor Stinner3a50e702011-10-18 21:21:00 +02007047/*
7048 * Decode a byte string from a code page into unicode object with an error
7049 * handler.
7050 *
7051 * Returns consumed size if succeed, or raise a WindowsError or
7052 * UnicodeDecodeError exception and returns -1 on error.
7053 */
7054static int
7055decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007056 PyObject **v,
7057 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 const char *errors)
7059{
7060 const char *startin = in;
7061 const char *endin = in + size;
7062 const DWORD flags = decode_code_page_flags(code_page);
7063 /* Ideally, we should get reason from FormatMessage. This is the Windows
7064 2000 English version of the message. */
7065 const char *reason = "No mapping for the Unicode character exists "
7066 "in the target code page.";
7067 /* each step cannot decode more than 1 character, but a character can be
7068 represented as a surrogate pair */
7069 wchar_t buffer[2], *startout, *out;
7070 int insize, outsize;
7071 PyObject *errorHandler = NULL;
7072 PyObject *exc = NULL;
7073 PyObject *encoding_obj = NULL;
7074 char *encoding;
7075 DWORD err;
7076 int ret = -1;
7077
7078 assert(size > 0);
7079
7080 encoding = code_page_name(code_page, &encoding_obj);
7081 if (encoding == NULL)
7082 return -1;
7083
7084 if (errors == NULL || strcmp(errors, "strict") == 0) {
7085 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7086 UnicodeDecodeError. */
7087 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7088 if (exc != NULL) {
7089 PyCodec_StrictErrors(exc);
7090 Py_CLEAR(exc);
7091 }
7092 goto error;
7093 }
7094
7095 if (*v == NULL) {
7096 /* Create unicode object */
7097 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7098 PyErr_NoMemory();
7099 goto error;
7100 }
Victor Stinnerab595942011-12-17 04:59:06 +01007101 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007102 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 if (*v == NULL)
7104 goto error;
7105 startout = PyUnicode_AS_UNICODE(*v);
7106 }
7107 else {
7108 /* Extend unicode object */
7109 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7110 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7111 PyErr_NoMemory();
7112 goto error;
7113 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007114 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 goto error;
7116 startout = PyUnicode_AS_UNICODE(*v) + n;
7117 }
7118
7119 /* Decode the byte string character per character */
7120 out = startout;
7121 while (in < endin)
7122 {
7123 /* Decode a character */
7124 insize = 1;
7125 do
7126 {
7127 outsize = MultiByteToWideChar(code_page, flags,
7128 in, insize,
7129 buffer, Py_ARRAY_LENGTH(buffer));
7130 if (outsize > 0)
7131 break;
7132 err = GetLastError();
7133 if (err != ERROR_NO_UNICODE_TRANSLATION
7134 && err != ERROR_INSUFFICIENT_BUFFER)
7135 {
7136 PyErr_SetFromWindowsErr(0);
7137 goto error;
7138 }
7139 insize++;
7140 }
7141 /* 4=maximum length of a UTF-8 sequence */
7142 while (insize <= 4 && (in + insize) <= endin);
7143
7144 if (outsize <= 0) {
7145 Py_ssize_t startinpos, endinpos, outpos;
7146
7147 startinpos = in - startin;
7148 endinpos = startinpos + 1;
7149 outpos = out - PyUnicode_AS_UNICODE(*v);
7150 if (unicode_decode_call_errorhandler(
7151 errors, &errorHandler,
7152 encoding, reason,
7153 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007154 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 {
7156 goto error;
7157 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007158 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 }
7160 else {
7161 in += insize;
7162 memcpy(out, buffer, outsize * sizeof(wchar_t));
7163 out += outsize;
7164 }
7165 }
7166
7167 /* write a NUL character at the end */
7168 *out = 0;
7169
7170 /* Extend unicode object */
7171 outsize = out - startout;
7172 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007173 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007175 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176
7177error:
7178 Py_XDECREF(encoding_obj);
7179 Py_XDECREF(errorHandler);
7180 Py_XDECREF(exc);
7181 return ret;
7182}
7183
Victor Stinner3a50e702011-10-18 21:21:00 +02007184static PyObject *
7185decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 const char *s, Py_ssize_t size,
7187 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188{
Victor Stinner76a31a62011-11-04 00:05:13 +01007189 PyObject *v = NULL;
7190 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 if (code_page < 0) {
7193 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7194 return NULL;
7195 }
7196
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 do
7201 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 if (size > INT_MAX) {
7204 chunk_size = INT_MAX;
7205 final = 0;
7206 done = 0;
7207 }
7208 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 {
7211 chunk_size = (int)size;
7212 final = (consumed == NULL);
7213 done = 1;
7214 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 /* Skip trailing lead-byte unless 'final' is set */
7217 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7218 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 if (chunk_size == 0 && done) {
7221 if (v != NULL)
7222 break;
7223 Py_INCREF(unicode_empty);
7224 return unicode_empty;
7225 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226
Victor Stinner76a31a62011-11-04 00:05:13 +01007227
7228 converted = decode_code_page_strict(code_page, &v,
7229 s, chunk_size);
7230 if (converted == -2)
7231 converted = decode_code_page_errors(code_page, &v,
7232 s, chunk_size,
7233 errors);
7234 assert(converted != 0);
7235
7236 if (converted < 0) {
7237 Py_XDECREF(v);
7238 return NULL;
7239 }
7240
7241 if (consumed)
7242 *consumed += converted;
7243
7244 s += converted;
7245 size -= converted;
7246 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007247
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007248 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007249}
7250
Alexander Belopolsky40018472011-02-26 01:02:56 +00007251PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007252PyUnicode_DecodeCodePageStateful(int code_page,
7253 const char *s,
7254 Py_ssize_t size,
7255 const char *errors,
7256 Py_ssize_t *consumed)
7257{
7258 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7259}
7260
7261PyObject *
7262PyUnicode_DecodeMBCSStateful(const char *s,
7263 Py_ssize_t size,
7264 const char *errors,
7265 Py_ssize_t *consumed)
7266{
7267 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7268}
7269
7270PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007271PyUnicode_DecodeMBCS(const char *s,
7272 Py_ssize_t size,
7273 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007274{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7276}
7277
Victor Stinner3a50e702011-10-18 21:21:00 +02007278static DWORD
7279encode_code_page_flags(UINT code_page, const char *errors)
7280{
7281 if (code_page == CP_UTF8) {
7282 if (winver.dwMajorVersion >= 6)
7283 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7284 and later */
7285 return WC_ERR_INVALID_CHARS;
7286 else
7287 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7288 return 0;
7289 }
7290 else if (code_page == CP_UTF7) {
7291 /* CP_UTF7 only supports flags=0 */
7292 return 0;
7293 }
7294 else {
7295 if (errors != NULL && strcmp(errors, "replace") == 0)
7296 return 0;
7297 else
7298 return WC_NO_BEST_FIT_CHARS;
7299 }
7300}
7301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 * Encode a Unicode string to a Windows code page into a byte string in strict
7304 * mode.
7305 *
7306 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7307 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007309static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007310encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007311 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313{
Victor Stinner554f3f02010-06-16 23:33:54 +00007314 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 BOOL *pusedDefaultChar = &usedDefaultChar;
7316 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007317 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007318 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007319 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 const DWORD flags = encode_code_page_flags(code_page, NULL);
7321 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007322 /* Create a substring so that we can get the UTF-16 representation
7323 of just the slice under consideration. */
7324 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007327
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007329 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007331 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007332
Victor Stinner2fc507f2011-11-04 20:06:39 +01007333 substring = PyUnicode_Substring(unicode, offset, offset+len);
7334 if (substring == NULL)
7335 return -1;
7336 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7337 if (p == NULL) {
7338 Py_DECREF(substring);
7339 return -1;
7340 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007341
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007342 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 outsize = WideCharToMultiByte(code_page, flags,
7344 p, size,
7345 NULL, 0,
7346 NULL, pusedDefaultChar);
7347 if (outsize <= 0)
7348 goto error;
7349 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007350 if (pusedDefaultChar && *pusedDefaultChar) {
7351 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007353 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007354
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007358 if (*outbytes == NULL) {
7359 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007361 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363 }
7364 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 const Py_ssize_t n = PyBytes_Size(*outbytes);
7367 if (outsize > PY_SSIZE_T_MAX - n) {
7368 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007369 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007372 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7373 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377 }
7378
7379 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 outsize = WideCharToMultiByte(code_page, flags,
7381 p, size,
7382 out, outsize,
7383 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007384 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 if (outsize <= 0)
7386 goto error;
7387 if (pusedDefaultChar && *pusedDefaultChar)
7388 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007390
Victor Stinner3a50e702011-10-18 21:21:00 +02007391error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007392 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7394 return -2;
7395 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007396 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007397}
7398
Victor Stinner3a50e702011-10-18 21:21:00 +02007399/*
7400 * Encode a Unicode string to a Windows code page into a byte string using a
7401 * error handler.
7402 *
7403 * Returns consumed characters if succeed, or raise a WindowsError and returns
7404 * -1 on other error.
7405 */
7406static int
7407encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007408 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007410{
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007412 Py_ssize_t pos = unicode_offset;
7413 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 /* Ideally, we should get reason from FormatMessage. This is the Windows
7415 2000 English version of the message. */
7416 const char *reason = "invalid character";
7417 /* 4=maximum length of a UTF-8 sequence */
7418 char buffer[4];
7419 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7420 Py_ssize_t outsize;
7421 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 PyObject *errorHandler = NULL;
7423 PyObject *exc = NULL;
7424 PyObject *encoding_obj = NULL;
7425 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007426 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 PyObject *rep;
7428 int ret = -1;
7429
7430 assert(insize > 0);
7431
7432 encoding = code_page_name(code_page, &encoding_obj);
7433 if (encoding == NULL)
7434 return -1;
7435
7436 if (errors == NULL || strcmp(errors, "strict") == 0) {
7437 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7438 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007439 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 if (exc != NULL) {
7441 PyCodec_StrictErrors(exc);
7442 Py_DECREF(exc);
7443 }
7444 Py_XDECREF(encoding_obj);
7445 return -1;
7446 }
7447
7448 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7449 pusedDefaultChar = &usedDefaultChar;
7450 else
7451 pusedDefaultChar = NULL;
7452
7453 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7454 PyErr_NoMemory();
7455 goto error;
7456 }
7457 outsize = insize * Py_ARRAY_LENGTH(buffer);
7458
7459 if (*outbytes == NULL) {
7460 /* Create string object */
7461 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7462 if (*outbytes == NULL)
7463 goto error;
7464 out = PyBytes_AS_STRING(*outbytes);
7465 }
7466 else {
7467 /* Extend string object */
7468 Py_ssize_t n = PyBytes_Size(*outbytes);
7469 if (n > PY_SSIZE_T_MAX - outsize) {
7470 PyErr_NoMemory();
7471 goto error;
7472 }
7473 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7474 goto error;
7475 out = PyBytes_AS_STRING(*outbytes) + n;
7476 }
7477
7478 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007479 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7482 wchar_t chars[2];
7483 int charsize;
7484 if (ch < 0x10000) {
7485 chars[0] = (wchar_t)ch;
7486 charsize = 1;
7487 }
7488 else {
7489 ch -= 0x10000;
7490 chars[0] = 0xd800 + (ch >> 10);
7491 chars[1] = 0xdc00 + (ch & 0x3ff);
7492 charsize = 2;
7493 }
7494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 buffer, Py_ARRAY_LENGTH(buffer),
7498 NULL, pusedDefaultChar);
7499 if (outsize > 0) {
7500 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7501 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007502 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 memcpy(out, buffer, outsize);
7504 out += outsize;
7505 continue;
7506 }
7507 }
7508 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7509 PyErr_SetFromWindowsErr(0);
7510 goto error;
7511 }
7512
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 rep = unicode_encode_call_errorhandler(
7514 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007515 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 if (rep == NULL)
7518 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007519 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520
7521 if (PyBytes_Check(rep)) {
7522 outsize = PyBytes_GET_SIZE(rep);
7523 if (outsize != 1) {
7524 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7525 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7526 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7527 Py_DECREF(rep);
7528 goto error;
7529 }
7530 out = PyBytes_AS_STRING(*outbytes) + offset;
7531 }
7532 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7533 out += outsize;
7534 }
7535 else {
7536 Py_ssize_t i;
7537 enum PyUnicode_Kind kind;
7538 void *data;
7539
Benjamin Petersonbac79492012-01-14 13:34:47 -05007540 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 Py_DECREF(rep);
7542 goto error;
7543 }
7544
7545 outsize = PyUnicode_GET_LENGTH(rep);
7546 if (outsize != 1) {
7547 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7548 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7549 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7550 Py_DECREF(rep);
7551 goto error;
7552 }
7553 out = PyBytes_AS_STRING(*outbytes) + offset;
7554 }
7555 kind = PyUnicode_KIND(rep);
7556 data = PyUnicode_DATA(rep);
7557 for (i=0; i < outsize; i++) {
7558 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7559 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007560 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007561 encoding, unicode,
7562 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 "unable to encode error handler result to ASCII");
7564 Py_DECREF(rep);
7565 goto error;
7566 }
7567 *out = (unsigned char)ch;
7568 out++;
7569 }
7570 }
7571 Py_DECREF(rep);
7572 }
7573 /* write a NUL byte */
7574 *out = 0;
7575 outsize = out - PyBytes_AS_STRING(*outbytes);
7576 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7577 if (_PyBytes_Resize(outbytes, outsize) < 0)
7578 goto error;
7579 ret = 0;
7580
7581error:
7582 Py_XDECREF(encoding_obj);
7583 Py_XDECREF(errorHandler);
7584 Py_XDECREF(exc);
7585 return ret;
7586}
7587
Victor Stinner3a50e702011-10-18 21:21:00 +02007588static PyObject *
7589encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007590 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 const char *errors)
7592{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007593 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007595 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007596 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007597
Benjamin Petersonbac79492012-01-14 13:34:47 -05007598 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007599 return NULL;
7600 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007601
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 if (code_page < 0) {
7603 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7604 return NULL;
7605 }
7606
Martin v. Löwis3d325192011-11-04 18:23:06 +01007607 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007608 return PyBytes_FromStringAndSize(NULL, 0);
7609
Victor Stinner7581cef2011-11-03 22:32:33 +01007610 offset = 0;
7611 do
7612 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007613#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007614 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 chunks. */
7616 if (len > INT_MAX/2) {
7617 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007618 done = 0;
7619 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007620 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007622 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007624 done = 1;
7625 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626
Victor Stinner76a31a62011-11-04 00:05:13 +01007627 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007629 errors);
7630 if (ret == -2)
7631 ret = encode_code_page_errors(code_page, &outbytes,
7632 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007633 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007634 if (ret < 0) {
7635 Py_XDECREF(outbytes);
7636 return NULL;
7637 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007638
Victor Stinner7581cef2011-11-03 22:32:33 +01007639 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007640 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007641 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 return outbytes;
7644}
7645
7646PyObject *
7647PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7648 Py_ssize_t size,
7649 const char *errors)
7650{
Victor Stinner7581cef2011-11-03 22:32:33 +01007651 PyObject *unicode, *res;
7652 unicode = PyUnicode_FromUnicode(p, size);
7653 if (unicode == NULL)
7654 return NULL;
7655 res = encode_code_page(CP_ACP, unicode, errors);
7656 Py_DECREF(unicode);
7657 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007658}
7659
7660PyObject *
7661PyUnicode_EncodeCodePage(int code_page,
7662 PyObject *unicode,
7663 const char *errors)
7664{
Victor Stinner7581cef2011-11-03 22:32:33 +01007665 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007666}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007667
Alexander Belopolsky40018472011-02-26 01:02:56 +00007668PyObject *
7669PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007670{
7671 if (!PyUnicode_Check(unicode)) {
7672 PyErr_BadArgument();
7673 return NULL;
7674 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007675 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007676}
7677
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007678#undef NEED_RETRY
7679
Victor Stinner99b95382011-07-04 14:23:54 +02007680#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007681
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682/* --- Character Mapping Codec -------------------------------------------- */
7683
Alexander Belopolsky40018472011-02-26 01:02:56 +00007684PyObject *
7685PyUnicode_DecodeCharmap(const char *s,
7686 Py_ssize_t size,
7687 PyObject *mapping,
7688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007690 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007691 Py_ssize_t startinpos;
7692 Py_ssize_t endinpos;
7693 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007695 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007696 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 PyObject *errorHandler = NULL;
7698 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007699
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 /* Default to Latin-1 */
7701 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007704 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007708 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007709 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007711 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007712 Py_ssize_t maplen;
7713 enum PyUnicode_Kind kind;
7714 void *data;
7715 Py_UCS4 x;
7716
Benjamin Petersonbac79492012-01-14 13:34:47 -05007717 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007718 return NULL;
7719
7720 maplen = PyUnicode_GET_LENGTH(mapping);
7721 data = PyUnicode_DATA(mapping);
7722 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 while (s < e) {
7724 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007727 x = PyUnicode_READ(kind, data, ch);
7728 else
7729 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007731 if (x == 0xfffe)
7732 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 startinpos = s-starts;
7735 endinpos = startinpos+1;
7736 if (unicode_decode_call_errorhandler(
7737 errors, &errorHandler,
7738 "charmap", "character maps to <undefined>",
7739 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007740 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 goto onError;
7742 }
7743 continue;
7744 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007745
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007746 if (unicode_putchar(&v, &outpos, x) < 0)
7747 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007750 }
7751 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 while (s < e) {
7753 unsigned char ch = *s;
7754 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007755
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7757 w = PyLong_FromLong((long)ch);
7758 if (w == NULL)
7759 goto onError;
7760 x = PyObject_GetItem(mapping, w);
7761 Py_DECREF(w);
7762 if (x == NULL) {
7763 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7764 /* No mapping found means: mapping is undefined. */
7765 PyErr_Clear();
7766 x = Py_None;
7767 Py_INCREF(x);
7768 } else
7769 goto onError;
7770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 /* Apply mapping */
7773 if (PyLong_Check(x)) {
7774 long value = PyLong_AS_LONG(x);
7775 if (value < 0 || value > 65535) {
7776 PyErr_SetString(PyExc_TypeError,
7777 "character mapping must be in range(65536)");
7778 Py_DECREF(x);
7779 goto onError;
7780 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007781 if (unicode_putchar(&v, &outpos, value) < 0)
7782 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 }
7784 else if (x == Py_None) {
7785 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 startinpos = s-starts;
7787 endinpos = startinpos+1;
7788 if (unicode_decode_call_errorhandler(
7789 errors, &errorHandler,
7790 "charmap", "character maps to <undefined>",
7791 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007792 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 Py_DECREF(x);
7794 goto onError;
7795 }
7796 Py_DECREF(x);
7797 continue;
7798 }
7799 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007800 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801
Benjamin Petersonbac79492012-01-14 13:34:47 -05007802 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007803 goto onError;
7804 targetsize = PyUnicode_GET_LENGTH(x);
7805
7806 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007808 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007809 PyUnicode_READ_CHAR(x, 0)) < 0)
7810 goto onError;
7811 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 else if (targetsize > 1) {
7813 /* 1-n mapping */
7814 if (targetsize > extrachars) {
7815 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 Py_ssize_t needed = (targetsize - extrachars) + \
7817 (targetsize << 2);
7818 extrachars += needed;
7819 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007820 if (unicode_resize(&v,
7821 PyUnicode_GET_LENGTH(v) + needed) < 0)
7822 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 Py_DECREF(x);
7824 goto onError;
7825 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007827 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7828 goto onError;
7829 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7830 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 extrachars -= targetsize;
7832 }
7833 /* 1-0 mapping: skip the character */
7834 }
7835 else {
7836 /* wrong return value */
7837 PyErr_SetString(PyExc_TypeError,
7838 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 Py_DECREF(x);
7840 goto onError;
7841 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 Py_DECREF(x);
7843 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007846 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007848 Py_XDECREF(errorHandler);
7849 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007850 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007851
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007853 Py_XDECREF(errorHandler);
7854 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 Py_XDECREF(v);
7856 return NULL;
7857}
7858
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859/* Charmap encoding: the lookup table */
7860
Alexander Belopolsky40018472011-02-26 01:02:56 +00007861struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 PyObject_HEAD
7863 unsigned char level1[32];
7864 int count2, count3;
7865 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866};
7867
7868static PyObject*
7869encoding_map_size(PyObject *obj, PyObject* args)
7870{
7871 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874}
7875
7876static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 PyDoc_STR("Return the size (in bytes) of this object") },
7879 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880};
7881
7882static void
7883encoding_map_dealloc(PyObject* o)
7884{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007885 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886}
7887
7888static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 "EncodingMap", /*tp_name*/
7891 sizeof(struct encoding_map), /*tp_basicsize*/
7892 0, /*tp_itemsize*/
7893 /* methods */
7894 encoding_map_dealloc, /*tp_dealloc*/
7895 0, /*tp_print*/
7896 0, /*tp_getattr*/
7897 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007898 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 0, /*tp_repr*/
7900 0, /*tp_as_number*/
7901 0, /*tp_as_sequence*/
7902 0, /*tp_as_mapping*/
7903 0, /*tp_hash*/
7904 0, /*tp_call*/
7905 0, /*tp_str*/
7906 0, /*tp_getattro*/
7907 0, /*tp_setattro*/
7908 0, /*tp_as_buffer*/
7909 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7910 0, /*tp_doc*/
7911 0, /*tp_traverse*/
7912 0, /*tp_clear*/
7913 0, /*tp_richcompare*/
7914 0, /*tp_weaklistoffset*/
7915 0, /*tp_iter*/
7916 0, /*tp_iternext*/
7917 encoding_map_methods, /*tp_methods*/
7918 0, /*tp_members*/
7919 0, /*tp_getset*/
7920 0, /*tp_base*/
7921 0, /*tp_dict*/
7922 0, /*tp_descr_get*/
7923 0, /*tp_descr_set*/
7924 0, /*tp_dictoffset*/
7925 0, /*tp_init*/
7926 0, /*tp_alloc*/
7927 0, /*tp_new*/
7928 0, /*tp_free*/
7929 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930};
7931
7932PyObject*
7933PyUnicode_BuildEncodingMap(PyObject* string)
7934{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007935 PyObject *result;
7936 struct encoding_map *mresult;
7937 int i;
7938 int need_dict = 0;
7939 unsigned char level1[32];
7940 unsigned char level2[512];
7941 unsigned char *mlevel1, *mlevel2, *mlevel3;
7942 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 int kind;
7944 void *data;
7945 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948 PyErr_BadArgument();
7949 return NULL;
7950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 kind = PyUnicode_KIND(string);
7952 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 memset(level1, 0xFF, sizeof level1);
7954 memset(level2, 0xFF, sizeof level2);
7955
7956 /* If there isn't a one-to-one mapping of NULL to \0,
7957 or if there are non-BMP characters, we need to use
7958 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007959 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960 need_dict = 1;
7961 for (i = 1; i < 256; i++) {
7962 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 ch = PyUnicode_READ(kind, data, i);
7964 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965 need_dict = 1;
7966 break;
7967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007969 /* unmapped character */
7970 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 l1 = ch >> 11;
7972 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973 if (level1[l1] == 0xFF)
7974 level1[l1] = count2++;
7975 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977 }
7978
7979 if (count2 >= 0xFF || count3 >= 0xFF)
7980 need_dict = 1;
7981
7982 if (need_dict) {
7983 PyObject *result = PyDict_New();
7984 PyObject *key, *value;
7985 if (!result)
7986 return NULL;
7987 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007989 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990 if (!key || !value)
7991 goto failed1;
7992 if (PyDict_SetItem(result, key, value) == -1)
7993 goto failed1;
7994 Py_DECREF(key);
7995 Py_DECREF(value);
7996 }
7997 return result;
7998 failed1:
7999 Py_XDECREF(key);
8000 Py_XDECREF(value);
8001 Py_DECREF(result);
8002 return NULL;
8003 }
8004
8005 /* Create a three-level trie */
8006 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8007 16*count2 + 128*count3 - 1);
8008 if (!result)
8009 return PyErr_NoMemory();
8010 PyObject_Init(result, &EncodingMapType);
8011 mresult = (struct encoding_map*)result;
8012 mresult->count2 = count2;
8013 mresult->count3 = count3;
8014 mlevel1 = mresult->level1;
8015 mlevel2 = mresult->level23;
8016 mlevel3 = mresult->level23 + 16*count2;
8017 memcpy(mlevel1, level1, 32);
8018 memset(mlevel2, 0xFF, 16*count2);
8019 memset(mlevel3, 0, 128*count3);
8020 count3 = 0;
8021 for (i = 1; i < 256; i++) {
8022 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008023 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024 /* unmapped character */
8025 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 o1 = PyUnicode_READ(kind, data, i)>>11;
8027 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 i2 = 16*mlevel1[o1] + o2;
8029 if (mlevel2[i2] == 0xFF)
8030 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 i3 = 128*mlevel2[i2] + o3;
8033 mlevel3[i3] = i;
8034 }
8035 return result;
8036}
8037
8038static int
Victor Stinner22168992011-11-20 17:09:18 +01008039encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040{
8041 struct encoding_map *map = (struct encoding_map*)mapping;
8042 int l1 = c>>11;
8043 int l2 = (c>>7) & 0xF;
8044 int l3 = c & 0x7F;
8045 int i;
8046
Victor Stinner22168992011-11-20 17:09:18 +01008047 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 if (c == 0)
8050 return 0;
8051 /* level 1*/
8052 i = map->level1[l1];
8053 if (i == 0xFF) {
8054 return -1;
8055 }
8056 /* level 2*/
8057 i = map->level23[16*i+l2];
8058 if (i == 0xFF) {
8059 return -1;
8060 }
8061 /* level 3 */
8062 i = map->level23[16*map->count2 + 128*i + l3];
8063 if (i == 0) {
8064 return -1;
8065 }
8066 return i;
8067}
8068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069/* Lookup the character ch in the mapping. If the character
8070 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008071 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008073charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074{
Christian Heimes217cfd12007-12-02 14:31:20 +00008075 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 PyObject *x;
8077
8078 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 x = PyObject_GetItem(mapping, w);
8081 Py_DECREF(w);
8082 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8084 /* No mapping found means: mapping is undefined. */
8085 PyErr_Clear();
8086 x = Py_None;
8087 Py_INCREF(x);
8088 return x;
8089 } else
8090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008092 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008094 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 long value = PyLong_AS_LONG(x);
8096 if (value < 0 || value > 255) {
8097 PyErr_SetString(PyExc_TypeError,
8098 "character mapping must be in range(256)");
8099 Py_DECREF(x);
8100 return NULL;
8101 }
8102 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008104 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 /* wrong return value */
8108 PyErr_Format(PyExc_TypeError,
8109 "character mapping must return integer, bytes or None, not %.400s",
8110 x->ob_type->tp_name);
8111 Py_DECREF(x);
8112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 }
8114}
8115
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008117charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8120 /* exponentially overallocate to minimize reallocations */
8121 if (requiredsize < 2*outsize)
8122 requiredsize = 2*outsize;
8123 if (_PyBytes_Resize(outobj, requiredsize))
8124 return -1;
8125 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126}
8127
/* Outcome of encoding a single character through a charmap:
     enc_SUCCESS   - the character was written to the output
     enc_FAILED    - the mapping has no entry for the character
     enc_EXCEPTION - an error occurred */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008132 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 space is available. Return a new reference to the object that
8134 was put in the output buffer, or Py_None, if the mapping was undefined
8135 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008136 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008137static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008138charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008139 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 PyObject *rep;
8142 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008143 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144
Christian Heimes90aa7642007-12-19 02:45:37 +00008145 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 if (res == -1)
8149 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 if (outsize<requiredsize)
8151 if (charmapencode_resize(outobj, outpos, requiredsize))
8152 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008153 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 outstart[(*outpos)++] = (char)res;
8155 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 }
8157
8158 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 Py_DECREF(rep);
8163 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 if (PyLong_Check(rep)) {
8166 Py_ssize_t requiredsize = *outpos+1;
8167 if (outsize<requiredsize)
8168 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8169 Py_DECREF(rep);
8170 return enc_EXCEPTION;
8171 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008172 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 else {
8176 const char *repchars = PyBytes_AS_STRING(rep);
8177 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8178 Py_ssize_t requiredsize = *outpos+repsize;
8179 if (outsize<requiredsize)
8180 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8181 Py_DECREF(rep);
8182 return enc_EXCEPTION;
8183 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008184 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 memcpy(outstart + *outpos, repchars, repsize);
8186 *outpos += repsize;
8187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008188 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 Py_DECREF(rep);
8190 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191}
8192
8193/* handle an error in PyUnicode_EncodeCharmap
8194 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195static int
8196charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008197 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008199 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008200 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201{
8202 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008203 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008204 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008205 enum PyUnicode_Kind kind;
8206 void *data;
8207 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008209 Py_ssize_t collstartpos = *inpos;
8210 Py_ssize_t collendpos = *inpos+1;
8211 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 char *encoding = "charmap";
8213 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008215 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008216 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217
Benjamin Petersonbac79492012-01-14 13:34:47 -05008218 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008219 return -1;
8220 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 /* find all unencodable characters */
8222 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008224 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008225 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008226 val = encoding_map_lookup(ch, mapping);
8227 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 break;
8229 ++collendpos;
8230 continue;
8231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008233 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8234 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 if (rep==NULL)
8236 return -1;
8237 else if (rep!=Py_None) {
8238 Py_DECREF(rep);
8239 break;
8240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 }
8244 /* cache callback name lookup
8245 * (if not done yet, i.e. it's the first error) */
8246 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 if ((errors==NULL) || (!strcmp(errors, "strict")))
8248 *known_errorHandler = 1;
8249 else if (!strcmp(errors, "replace"))
8250 *known_errorHandler = 2;
8251 else if (!strcmp(errors, "ignore"))
8252 *known_errorHandler = 3;
8253 else if (!strcmp(errors, "xmlcharrefreplace"))
8254 *known_errorHandler = 4;
8255 else
8256 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 }
8258 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008260 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 return -1;
8262 case 2: /* replace */
8263 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 x = charmapencode_output('?', mapping, res, respos);
8265 if (x==enc_EXCEPTION) {
8266 return -1;
8267 }
8268 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008269 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return -1;
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
8273 /* fall through */
8274 case 3: /* ignore */
8275 *inpos = collendpos;
8276 break;
8277 case 4: /* xmlcharrefreplace */
8278 /* generate replacement (temporarily (mis)uses p) */
8279 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 char buffer[2+29+1+1];
8281 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 for (cp = buffer; *cp; ++cp) {
8284 x = charmapencode_output(*cp, mapping, res, respos);
8285 if (x==enc_EXCEPTION)
8286 return -1;
8287 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008288 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return -1;
8290 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 }
8292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008293 *inpos = collendpos;
8294 break;
8295 default:
8296 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008301 if (PyBytes_Check(repunicode)) {
8302 /* Directly copy bytes result to output. */
8303 Py_ssize_t outsize = PyBytes_Size(*res);
8304 Py_ssize_t requiredsize;
8305 repsize = PyBytes_Size(repunicode);
8306 requiredsize = *respos + repsize;
8307 if (requiredsize > outsize)
8308 /* Make room for all additional bytes. */
8309 if (charmapencode_resize(res, respos, requiredsize)) {
8310 Py_DECREF(repunicode);
8311 return -1;
8312 }
8313 memcpy(PyBytes_AsString(*res) + *respos,
8314 PyBytes_AsString(repunicode), repsize);
8315 *respos += repsize;
8316 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008317 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008318 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008319 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008321 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008322 Py_DECREF(repunicode);
8323 return -1;
8324 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008325 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008326 data = PyUnicode_DATA(repunicode);
8327 kind = PyUnicode_KIND(repunicode);
8328 for (index = 0; index < repsize; index++) {
8329 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8330 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008332 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return -1;
8334 }
8335 else if (x==enc_FAILED) {
8336 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008337 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 return -1;
8339 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008340 }
8341 *inpos = newpos;
8342 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 }
8344 return 0;
8345}
8346
Alexander Belopolsky40018472011-02-26 01:02:56 +00008347PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008348_PyUnicode_EncodeCharmap(PyObject *unicode,
8349 PyObject *mapping,
8350 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 /* output object */
8353 PyObject *res = NULL;
8354 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008355 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008358 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 PyObject *errorHandler = NULL;
8360 PyObject *exc = NULL;
8361 /* the following variable is used for caching string comparisons
8362 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8363 * 3=ignore, 4=xmlcharrefreplace */
8364 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365
Benjamin Petersonbac79492012-01-14 13:34:47 -05008366 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 return NULL;
8368 size = PyUnicode_GET_LENGTH(unicode);
8369
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 /* Default to Latin-1 */
8371 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 /* allocate enough for a simple encoding without
8375 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008376 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 if (res == NULL)
8378 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008379 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 if (x==enc_EXCEPTION) /* error */
8387 goto onError;
8388 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 &exc,
8391 &known_errorHandler, &errorHandler, errors,
8392 &res, &respos)) {
8393 goto onError;
8394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 else
8397 /* done with this character => adjust input position */
8398 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008402 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008403 if (_PyBytes_Resize(&res, respos) < 0)
8404 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 Py_XDECREF(exc);
8407 Py_XDECREF(errorHandler);
8408 return res;
8409
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 Py_XDECREF(res);
8412 Py_XDECREF(exc);
8413 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 return NULL;
8415}
8416
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417/* Deprecated */
8418PyObject *
8419PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8420 Py_ssize_t size,
8421 PyObject *mapping,
8422 const char *errors)
8423{
8424 PyObject *result;
8425 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8426 if (unicode == NULL)
8427 return NULL;
8428 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8429 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008430 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431}
8432
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433PyObject *
8434PyUnicode_AsCharmapString(PyObject *unicode,
8435 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436{
8437 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 PyErr_BadArgument();
8439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442}
8443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008445static void
8446make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008448 Py_ssize_t startpos, Py_ssize_t endpos,
8449 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 *exceptionObject = _PyUnicodeTranslateError_Create(
8453 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 }
8455 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8457 goto onError;
8458 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8459 goto onError;
8460 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8461 goto onError;
8462 return;
8463 onError:
8464 Py_DECREF(*exceptionObject);
8465 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 }
8467}
8468
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008470static void
8471raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008473 Py_ssize_t startpos, Py_ssize_t endpos,
8474 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475{
8476 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480}
8481
8482/* error handling callback helper:
8483 build arguments, call the callback and check the arguments,
8484 put the result into newpos and return the replacement string, which
8485 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486static PyObject *
8487unicode_translate_call_errorhandler(const char *errors,
8488 PyObject **errorHandler,
8489 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008491 Py_ssize_t startpos, Py_ssize_t endpos,
8492 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008494 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008496 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 PyObject *restuple;
8498 PyObject *resunicode;
8499
8500 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 }
8505
8506 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510
8511 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008516 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 Py_DECREF(restuple);
8518 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
8520 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 &resunicode, &i_newpos)) {
8522 Py_DECREF(restuple);
8523 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008525 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 else
8528 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8531 Py_DECREF(restuple);
8532 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 Py_INCREF(resunicode);
8535 Py_DECREF(restuple);
8536 return resunicode;
8537}
8538
8539/* Lookup the character ch in the mapping and put the result in result,
8540 which must be decrefed by the caller.
8541 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
Christian Heimes217cfd12007-12-02 14:31:20 +00008545 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 PyObject *x;
8547
8548 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 x = PyObject_GetItem(mapping, w);
8551 Py_DECREF(w);
8552 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8554 /* No mapping found means: use 1:1 mapping. */
8555 PyErr_Clear();
8556 *result = NULL;
8557 return 0;
8558 } else
8559 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 }
8561 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 *result = x;
8563 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008565 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 long value = PyLong_AS_LONG(x);
8567 long max = PyUnicode_GetMax();
8568 if (value < 0 || value > max) {
8569 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008570 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 Py_DECREF(x);
8572 return -1;
8573 }
8574 *result = x;
8575 return 0;
8576 }
8577 else if (PyUnicode_Check(x)) {
8578 *result = x;
8579 return 0;
8580 }
8581 else {
8582 /* wrong return value */
8583 PyErr_SetString(PyExc_TypeError,
8584 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008585 Py_DECREF(x);
8586 return -1;
8587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588}
8589/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 if not reallocate and adjust various state variables.
8591 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008592static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008597 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 /* exponentially overallocate to minimize reallocations */
8599 if (requiredsize < 2 * oldsize)
8600 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8602 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605 }
8606 return 0;
8607}
8608/* lookup the character, put the result in the output string and adjust
8609 various state variables. Return a new reference to the object that
8610 was put in the output buffer in *result, or Py_None, if the mapping was
8611 undefined (in which case no character was written).
8612 The called must decref result.
8613 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8616 PyObject *mapping, Py_UCS4 **output,
8617 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008618 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8621 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 }
8627 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008629 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 }
8633 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 Py_ssize_t repsize;
8635 if (PyUnicode_READY(*res) == -1)
8636 return -1;
8637 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (repsize==1) {
8639 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 }
8642 else if (repsize!=0) {
8643 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 Py_ssize_t requiredsize = *opos +
8645 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 Py_ssize_t i;
8648 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 for(i = 0; i < repsize; i++)
8651 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 }
8654 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 return 0;
8657}
8658
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660_PyUnicode_TranslateCharmap(PyObject *input,
8661 PyObject *mapping,
8662 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 /* input object */
8665 char *idata;
8666 Py_ssize_t size, i;
8667 int kind;
8668 /* output buffer */
8669 Py_UCS4 *output = NULL;
8670 Py_ssize_t osize;
8671 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 char *reason = "character maps to <undefined>";
8675 PyObject *errorHandler = NULL;
8676 PyObject *exc = NULL;
8677 /* the following variable is used for caching string comparisons
8678 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8679 * 3=ignore, 4=xmlcharrefreplace */
8680 int known_errorHandler = -1;
8681
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 PyErr_BadArgument();
8684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 if (PyUnicode_READY(input) == -1)
8688 return NULL;
8689 idata = (char*)PyUnicode_DATA(input);
8690 kind = PyUnicode_KIND(input);
8691 size = PyUnicode_GET_LENGTH(input);
8692 i = 0;
8693
8694 if (size == 0) {
8695 Py_INCREF(input);
8696 return input;
8697 }
8698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 /* allocate enough for a simple 1:1 translation without
8700 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 osize = size;
8702 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8703 opos = 0;
8704 if (output == NULL) {
8705 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 /* try to encode it */
8711 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 if (charmaptranslate_output(input, i, mapping,
8713 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 Py_XDECREF(x);
8715 goto onError;
8716 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008717 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 else { /* untranslatable character */
8721 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8722 Py_ssize_t repsize;
8723 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 Py_ssize_t collstart = i;
8727 Py_ssize_t collend = i+1;
8728 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 while (collend < size) {
8732 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 goto onError;
8734 Py_XDECREF(x);
8735 if (x!=Py_None)
8736 break;
8737 ++collend;
8738 }
8739 /* cache callback name lookup
8740 * (if not done yet, i.e. it's the first error) */
8741 if (known_errorHandler==-1) {
8742 if ((errors==NULL) || (!strcmp(errors, "strict")))
8743 known_errorHandler = 1;
8744 else if (!strcmp(errors, "replace"))
8745 known_errorHandler = 2;
8746 else if (!strcmp(errors, "ignore"))
8747 known_errorHandler = 3;
8748 else if (!strcmp(errors, "xmlcharrefreplace"))
8749 known_errorHandler = 4;
8750 else
8751 known_errorHandler = 0;
8752 }
8753 switch (known_errorHandler) {
8754 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 raise_translate_exception(&exc, input, collstart,
8756 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 case 2: /* replace */
8759 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 for (coll = collstart; coll<collend; coll++)
8761 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 /* fall through */
8763 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 break;
8766 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 /* generate replacement (temporarily (mis)uses i) */
8768 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 char buffer[2+29+1+1];
8770 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8772 if (charmaptranslate_makespace(&output, &osize,
8773 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 goto onError;
8775 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 break;
8780 default:
8781 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 reason, input, &exc,
8783 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008784 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008786 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008787 Py_DECREF(repunicode);
8788 goto onError;
8789 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 repsize = PyUnicode_GET_LENGTH(repunicode);
8792 if (charmaptranslate_makespace(&output, &osize,
8793 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 Py_DECREF(repunicode);
8795 goto onError;
8796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 for (uni2 = 0; repsize-->0; ++uni2)
8798 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8799 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008802 }
8803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8805 if (!res)
8806 goto onError;
8807 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808 Py_XDECREF(exc);
8809 Py_XDECREF(errorHandler);
8810 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814 Py_XDECREF(exc);
8815 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 return NULL;
8817}
8818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819/* Deprecated. Use PyUnicode_Translate instead. */
8820PyObject *
8821PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8822 Py_ssize_t size,
8823 PyObject *mapping,
8824 const char *errors)
8825{
8826 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8827 if (!unicode)
8828 return NULL;
8829 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8830}
8831
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832PyObject *
8833PyUnicode_Translate(PyObject *str,
8834 PyObject *mapping,
8835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
8837 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008838
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 str = PyUnicode_FromObject(str);
8840 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 Py_DECREF(str);
8844 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008845
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 Py_XDECREF(str);
8848 return NULL;
8849}
Tim Petersced69f82003-09-16 20:30:58 +00008850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008852fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853{
8854 /* No need to call PyUnicode_READY(self) because this function is only
8855 called as a callback from fixup() which does it already. */
8856 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8857 const int kind = PyUnicode_KIND(self);
8858 void *data = PyUnicode_DATA(self);
8859 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008860 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 Py_ssize_t i;
8862
8863 for (i = 0; i < len; ++i) {
8864 ch = PyUnicode_READ(kind, data, i);
8865 fixed = 0;
8866 if (ch > 127) {
8867 if (Py_UNICODE_ISSPACE(ch))
8868 fixed = ' ';
8869 else {
8870 const int decimal = Py_UNICODE_TODECIMAL(ch);
8871 if (decimal >= 0)
8872 fixed = '0' + decimal;
8873 }
8874 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008875 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 if (fixed > maxchar)
8877 maxchar = fixed;
8878 PyUnicode_WRITE(kind, data, i, fixed);
8879 }
8880 else if (ch > maxchar)
8881 maxchar = ch;
8882 }
8883 else if (ch > maxchar)
8884 maxchar = ch;
8885 }
8886
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008887 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888}
8889
8890PyObject *
8891_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8892{
8893 if (!PyUnicode_Check(unicode)) {
8894 PyErr_BadInternalCall();
8895 return NULL;
8896 }
8897 if (PyUnicode_READY(unicode) == -1)
8898 return NULL;
8899 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8900 /* If the string is already ASCII, just return the same string */
8901 Py_INCREF(unicode);
8902 return unicode;
8903 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008904 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905}
8906
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008907PyObject *
8908PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8909 Py_ssize_t length)
8910{
Victor Stinnerf0124502011-11-21 23:12:56 +01008911 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008912 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008913 Py_UCS4 maxchar;
8914 enum PyUnicode_Kind kind;
8915 void *data;
8916
8917 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008918 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008919 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008920 if (ch > 127) {
8921 int decimal = Py_UNICODE_TODECIMAL(ch);
8922 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008923 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008924 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008925 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008926 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008927
8928 /* Copy to a new string */
8929 decimal = PyUnicode_New(length, maxchar);
8930 if (decimal == NULL)
8931 return decimal;
8932 kind = PyUnicode_KIND(decimal);
8933 data = PyUnicode_DATA(decimal);
8934 /* Iterate over code points */
8935 for (i = 0; i < length; i++) {
8936 Py_UNICODE ch = s[i];
8937 if (ch > 127) {
8938 int decimal = Py_UNICODE_TODECIMAL(ch);
8939 if (decimal >= 0)
8940 ch = '0' + decimal;
8941 }
8942 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008944 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008945}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008946/* --- Decimal Encoder ---------------------------------------------------- */
8947
Alexander Belopolsky40018472011-02-26 01:02:56 +00008948int
8949PyUnicode_EncodeDecimal(Py_UNICODE *s,
8950 Py_ssize_t length,
8951 char *output,
8952 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008953{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008954 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008955 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008956 enum PyUnicode_Kind kind;
8957 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008958
8959 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 PyErr_BadArgument();
8961 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008962 }
8963
Victor Stinner42bf7752011-11-21 22:52:58 +01008964 unicode = PyUnicode_FromUnicode(s, length);
8965 if (unicode == NULL)
8966 return -1;
8967
Benjamin Petersonbac79492012-01-14 13:34:47 -05008968 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008969 Py_DECREF(unicode);
8970 return -1;
8971 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008972 kind = PyUnicode_KIND(unicode);
8973 data = PyUnicode_DATA(unicode);
8974
Victor Stinnerb84d7232011-11-22 01:50:07 +01008975 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008976 PyObject *exc;
8977 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008979 Py_ssize_t startpos;
8980
8981 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008982
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008984 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008985 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 decimal = Py_UNICODE_TODECIMAL(ch);
8989 if (decimal >= 0) {
8990 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008991 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 continue;
8993 }
8994 if (0 < ch && ch < 256) {
8995 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008996 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 continue;
8998 }
Victor Stinner6345be92011-11-25 20:09:01 +01008999
Victor Stinner42bf7752011-11-21 22:52:58 +01009000 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009001 exc = NULL;
9002 raise_encode_exception(&exc, "decimal", unicode,
9003 startpos, startpos+1,
9004 "invalid decimal Unicode string");
9005 Py_XDECREF(exc);
9006 Py_DECREF(unicode);
9007 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009008 }
9009 /* 0-terminate the output string */
9010 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009011 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009012 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009013}
9014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015/* --- Helpers ------------------------------------------------------------ */
9016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009018any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 Py_ssize_t start,
9020 Py_ssize_t end)
9021{
9022 int kind1, kind2, kind;
9023 void *buf1, *buf2;
9024 Py_ssize_t len1, len2, result;
9025
9026 kind1 = PyUnicode_KIND(s1);
9027 kind2 = PyUnicode_KIND(s2);
9028 kind = kind1 > kind2 ? kind1 : kind2;
9029 buf1 = PyUnicode_DATA(s1);
9030 buf2 = PyUnicode_DATA(s2);
9031 if (kind1 != kind)
9032 buf1 = _PyUnicode_AsKind(s1, kind);
9033 if (!buf1)
9034 return -2;
9035 if (kind2 != kind)
9036 buf2 = _PyUnicode_AsKind(s2, kind);
9037 if (!buf2) {
9038 if (kind1 != kind) PyMem_Free(buf1);
9039 return -2;
9040 }
9041 len1 = PyUnicode_GET_LENGTH(s1);
9042 len2 = PyUnicode_GET_LENGTH(s2);
9043
Victor Stinner794d5672011-10-10 03:21:36 +02009044 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009045 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009046 case PyUnicode_1BYTE_KIND:
9047 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9048 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9049 else
9050 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9051 break;
9052 case PyUnicode_2BYTE_KIND:
9053 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9054 break;
9055 case PyUnicode_4BYTE_KIND:
9056 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9057 break;
9058 default:
9059 assert(0); result = -2;
9060 }
9061 }
9062 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009063 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009064 case PyUnicode_1BYTE_KIND:
9065 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9066 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9067 else
9068 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9069 break;
9070 case PyUnicode_2BYTE_KIND:
9071 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9072 break;
9073 case PyUnicode_4BYTE_KIND:
9074 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9075 break;
9076 default:
9077 assert(0); result = -2;
9078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 }
9080
9081 if (kind1 != kind)
9082 PyMem_Free(buf1);
9083 if (kind2 != kind)
9084 PyMem_Free(buf2);
9085
9086 return result;
9087}
9088
9089Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009090_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 Py_ssize_t n_buffer,
9092 void *digits, Py_ssize_t n_digits,
9093 Py_ssize_t min_width,
9094 const char *grouping,
9095 const char *thousands_sep)
9096{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009097 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009099 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9100 return _PyUnicode_ascii_InsertThousandsGrouping(
9101 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9102 min_width, grouping, thousands_sep);
9103 else
9104 return _PyUnicode_ucs1_InsertThousandsGrouping(
9105 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9106 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 case PyUnicode_2BYTE_KIND:
9108 return _PyUnicode_ucs2_InsertThousandsGrouping(
9109 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9110 min_width, grouping, thousands_sep);
9111 case PyUnicode_4BYTE_KIND:
9112 return _PyUnicode_ucs4_InsertThousandsGrouping(
9113 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9114 min_width, grouping, thousands_sep);
9115 }
9116 assert(0);
9117 return -1;
9118}
9119
9120
/* helper macro to fixup start/end slice values

   end is clamped to len; a negative end or start has len added to it and
   is then clamped to 0.  start and end must be modifiable lvalues; len is
   evaluated more than once.

   Wrapped in do { } while (0) so that "ADJUST_INDICES(...);" is a single
   statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009135
Alexander Belopolsky40018472011-02-26 01:02:56 +00009136Py_ssize_t
9137PyUnicode_Count(PyObject *str,
9138 PyObject *substr,
9139 Py_ssize_t start,
9140 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009142 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009143 PyObject* str_obj;
9144 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 int kind1, kind2, kind;
9146 void *buf1 = NULL, *buf2 = NULL;
9147 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009148
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009149 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009150 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009152 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009153 if (!sub_obj) {
9154 Py_DECREF(str_obj);
9155 return -1;
9156 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009157 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009158 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 Py_DECREF(str_obj);
9160 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 }
Tim Petersced69f82003-09-16 20:30:58 +00009162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 kind1 = PyUnicode_KIND(str_obj);
9164 kind2 = PyUnicode_KIND(sub_obj);
9165 kind = kind1 > kind2 ? kind1 : kind2;
9166 buf1 = PyUnicode_DATA(str_obj);
9167 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009168 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 if (!buf1)
9170 goto onError;
9171 buf2 = PyUnicode_DATA(sub_obj);
9172 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009173 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 if (!buf2)
9175 goto onError;
9176 len1 = PyUnicode_GET_LENGTH(str_obj);
9177 len2 = PyUnicode_GET_LENGTH(sub_obj);
9178
9179 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009180 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009182 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9183 result = asciilib_count(
9184 ((Py_UCS1*)buf1) + start, end - start,
9185 buf2, len2, PY_SSIZE_T_MAX
9186 );
9187 else
9188 result = ucs1lib_count(
9189 ((Py_UCS1*)buf1) + start, end - start,
9190 buf2, len2, PY_SSIZE_T_MAX
9191 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 break;
9193 case PyUnicode_2BYTE_KIND:
9194 result = ucs2lib_count(
9195 ((Py_UCS2*)buf1) + start, end - start,
9196 buf2, len2, PY_SSIZE_T_MAX
9197 );
9198 break;
9199 case PyUnicode_4BYTE_KIND:
9200 result = ucs4lib_count(
9201 ((Py_UCS4*)buf1) + start, end - start,
9202 buf2, len2, PY_SSIZE_T_MAX
9203 );
9204 break;
9205 default:
9206 assert(0); result = 0;
9207 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009208
9209 Py_DECREF(sub_obj);
9210 Py_DECREF(str_obj);
9211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 if (kind1 != kind)
9213 PyMem_Free(buf1);
9214 if (kind2 != kind)
9215 PyMem_Free(buf2);
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 onError:
9219 Py_DECREF(sub_obj);
9220 Py_DECREF(str_obj);
9221 if (kind1 != kind && buf1)
9222 PyMem_Free(buf1);
9223 if (kind2 != kind && buf2)
9224 PyMem_Free(buf2);
9225 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226}
9227
Alexander Belopolsky40018472011-02-26 01:02:56 +00009228Py_ssize_t
9229PyUnicode_Find(PyObject *str,
9230 PyObject *sub,
9231 Py_ssize_t start,
9232 Py_ssize_t end,
9233 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009235 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009238 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009240 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009241 if (!sub) {
9242 Py_DECREF(str);
9243 return -2;
9244 }
9245 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9246 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 Py_DECREF(str);
9248 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 }
Tim Petersced69f82003-09-16 20:30:58 +00009250
Victor Stinner794d5672011-10-10 03:21:36 +02009251 result = any_find_slice(direction,
9252 str, sub, start, end
9253 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009256 Py_DECREF(sub);
9257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 return result;
9259}
9260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261Py_ssize_t
9262PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9263 Py_ssize_t start, Py_ssize_t end,
9264 int direction)
9265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009267 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 if (PyUnicode_READY(str) == -1)
9269 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009270 if (start < 0 || end < 0) {
9271 PyErr_SetString(PyExc_IndexError, "string index out of range");
9272 return -2;
9273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (end > PyUnicode_GET_LENGTH(str))
9275 end = PyUnicode_GET_LENGTH(str);
9276 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009277 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9278 kind, end-start, ch, direction);
9279 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009281 else
9282 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283}
9284
Alexander Belopolsky40018472011-02-26 01:02:56 +00009285static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009286tailmatch(PyObject *self,
9287 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009288 Py_ssize_t start,
9289 Py_ssize_t end,
9290 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 int kind_self;
9293 int kind_sub;
9294 void *data_self;
9295 void *data_sub;
9296 Py_ssize_t offset;
9297 Py_ssize_t i;
9298 Py_ssize_t end_sub;
9299
9300 if (PyUnicode_READY(self) == -1 ||
9301 PyUnicode_READY(substring) == -1)
9302 return 0;
9303
9304 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 return 1;
9306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9308 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 kind_self = PyUnicode_KIND(self);
9313 data_self = PyUnicode_DATA(self);
9314 kind_sub = PyUnicode_KIND(substring);
9315 data_sub = PyUnicode_DATA(substring);
9316 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9317
9318 if (direction > 0)
9319 offset = end;
9320 else
9321 offset = start;
9322
9323 if (PyUnicode_READ(kind_self, data_self, offset) ==
9324 PyUnicode_READ(kind_sub, data_sub, 0) &&
9325 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9326 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9327 /* If both are of the same kind, memcmp is sufficient */
9328 if (kind_self == kind_sub) {
9329 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009330 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 data_sub,
9332 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009333 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 }
9335 /* otherwise we have to compare each character by first accesing it */
9336 else {
9337 /* We do not need to compare 0 and len(substring)-1 because
9338 the if statement above ensured already that they are equal
9339 when we end up here. */
9340 // TODO: honor direction and do a forward or backwards search
9341 for (i = 1; i < end_sub; ++i) {
9342 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9343 PyUnicode_READ(kind_sub, data_sub, i))
9344 return 0;
9345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
9349
9350 return 0;
9351}
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353Py_ssize_t
9354PyUnicode_Tailmatch(PyObject *str,
9355 PyObject *substr,
9356 Py_ssize_t start,
9357 Py_ssize_t end,
9358 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009360 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 str = PyUnicode_FromObject(str);
9363 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 substr = PyUnicode_FromObject(substr);
9366 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 Py_DECREF(str);
9368 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Tim Petersced69f82003-09-16 20:30:58 +00009370
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009371 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 Py_DECREF(str);
9374 Py_DECREF(substr);
9375 return result;
9376}
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378/* Apply fixfct filter to the Unicode object self and return a
9379 reference to the modified object */
9380
Alexander Belopolsky40018472011-02-26 01:02:56 +00009381static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009382fixup(PyObject *self,
9383 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 PyObject *u;
9386 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009387 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009389 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009392 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 /* fix functions return the new maximum character in a string,
9395 if the kind of the resulting unicode object does not change,
9396 everything is fine. Otherwise we need to change the string kind
9397 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009398 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009399
9400 if (maxchar_new == 0) {
9401 /* no changes */;
9402 if (PyUnicode_CheckExact(self)) {
9403 Py_DECREF(u);
9404 Py_INCREF(self);
9405 return self;
9406 }
9407 else
9408 return u;
9409 }
9410
9411 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 maxchar_new = 127;
9413 else if (maxchar_new <= 255)
9414 maxchar_new = 255;
9415 else if (maxchar_new <= 65535)
9416 maxchar_new = 65535;
9417 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009418 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419
Victor Stinnereaab6042011-12-11 22:22:39 +01009420 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009422
9423 /* In case the maximum character changed, we need to
9424 convert the string to the new category. */
9425 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9426 if (v == NULL) {
9427 Py_DECREF(u);
9428 return NULL;
9429 }
9430 if (maxchar_new > maxchar_old) {
9431 /* If the maxchar increased so that the kind changed, not all
9432 characters are representable anymore and we need to fix the
9433 string again. This only happens in very few cases. */
9434 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9435 maxchar_old = fixfct(v);
9436 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 }
9438 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009439 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009441 Py_DECREF(u);
9442 assert(_PyUnicode_CheckConsistency(v, 1));
9443 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444}
9445
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446static PyObject *
9447ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009449 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9450 char *resdata, *data = PyUnicode_DATA(self);
9451 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009452
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009453 res = PyUnicode_New(len, 127);
9454 if (res == NULL)
9455 return NULL;
9456 resdata = PyUnicode_DATA(res);
9457 if (lower)
9458 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460 _Py_bytes_upper(resdata, data, len);
9461 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462}
9463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009467 Py_ssize_t j;
9468 int final_sigma;
9469 Py_UCS4 c;
9470 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009471
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9473
9474 where ! is a negation and \p{xxx} is a character with property xxx.
9475 */
9476 for (j = i - 1; j >= 0; j--) {
9477 c = PyUnicode_READ(kind, data, j);
9478 if (!_PyUnicode_IsCaseIgnorable(c))
9479 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009481 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9482 if (final_sigma) {
9483 for (j = i + 1; j < length; j++) {
9484 c = PyUnicode_READ(kind, data, j);
9485 if (!_PyUnicode_IsCaseIgnorable(c))
9486 break;
9487 }
9488 final_sigma = j == length || !_PyUnicode_IsCased(c);
9489 }
9490 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491}
9492
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493static int
9494lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9495 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 /* Obscure special case. */
9498 if (c == 0x3A3) {
9499 mapped[0] = handle_capital_sigma(kind, data, length, i);
9500 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009502 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503}
9504
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505static Py_ssize_t
9506do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508 Py_ssize_t i, k = 0;
9509 int n_res, j;
9510 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009511
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 c = PyUnicode_READ(kind, data, 0);
9513 n_res = _PyUnicode_ToUpperFull(c, mapped);
9514 for (j = 0; j < n_res; j++) {
9515 if (mapped[j] > *maxchar)
9516 *maxchar = mapped[j];
9517 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519 for (i = 1; i < length; i++) {
9520 c = PyUnicode_READ(kind, data, i);
9521 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9522 for (j = 0; j < n_res; j++) {
9523 if (mapped[j] > *maxchar)
9524 *maxchar = mapped[j];
9525 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009526 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009527 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009528 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529}
9530
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009531static Py_ssize_t
9532do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9533 Py_ssize_t i, k = 0;
9534
9535 for (i = 0; i < length; i++) {
9536 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9537 int n_res, j;
9538 if (Py_UNICODE_ISUPPER(c)) {
9539 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9540 }
9541 else if (Py_UNICODE_ISLOWER(c)) {
9542 n_res = _PyUnicode_ToUpperFull(c, mapped);
9543 }
9544 else {
9545 n_res = 1;
9546 mapped[0] = c;
9547 }
9548 for (j = 0; j < n_res; j++) {
9549 if (mapped[j] > *maxchar)
9550 *maxchar = mapped[j];
9551 res[k++] = mapped[j];
9552 }
9553 }
9554 return k;
9555}
9556
9557static Py_ssize_t
9558do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9559 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009561 Py_ssize_t i, k = 0;
9562
9563 for (i = 0; i < length; i++) {
9564 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9565 int n_res, j;
9566 if (lower)
9567 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9568 else
9569 n_res = _PyUnicode_ToUpperFull(c, mapped);
9570 for (j = 0; j < n_res; j++) {
9571 if (mapped[j] > *maxchar)
9572 *maxchar = mapped[j];
9573 res[k++] = mapped[j];
9574 }
9575 }
9576 return k;
9577}
9578
9579static Py_ssize_t
9580do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9581{
9582 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9583}
9584
9585static Py_ssize_t
9586do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9587{
9588 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9589}
9590
Benjamin Petersone51757f2012-01-12 21:10:29 -05009591static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009592do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9593{
9594 Py_ssize_t i, k = 0;
9595
9596 for (i = 0; i < length; i++) {
9597 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9598 Py_UCS4 mapped[3];
9599 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9600 for (j = 0; j < n_res; j++) {
9601 if (mapped[j] > *maxchar)
9602 *maxchar = mapped[j];
9603 res[k++] = mapped[j];
9604 }
9605 }
9606 return k;
9607}
9608
9609static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009610do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9611{
9612 Py_ssize_t i, k = 0;
9613 int previous_is_cased;
9614
9615 previous_is_cased = 0;
9616 for (i = 0; i < length; i++) {
9617 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9618 Py_UCS4 mapped[3];
9619 int n_res, j;
9620
9621 if (previous_is_cased)
9622 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9623 else
9624 n_res = _PyUnicode_ToTitleFull(c, mapped);
9625
9626 for (j = 0; j < n_res; j++) {
9627 if (mapped[j] > *maxchar)
9628 *maxchar = mapped[j];
9629 res[k++] = mapped[j];
9630 }
9631
9632 previous_is_cased = _PyUnicode_IsCased(c);
9633 }
9634 return k;
9635}
9636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637static PyObject *
9638case_operation(PyObject *self,
9639 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9640{
9641 PyObject *res = NULL;
9642 Py_ssize_t length, newlength = 0;
9643 int kind, outkind;
9644 void *data, *outdata;
9645 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9646
9647 if (PyUnicode_READY(self) == -1)
9648 return NULL;
9649
9650 kind = PyUnicode_KIND(self);
9651 data = PyUnicode_DATA(self);
9652 length = PyUnicode_GET_LENGTH(self);
9653 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9654 if (tmp == NULL)
9655 return PyErr_NoMemory();
9656 newlength = perform(kind, data, length, tmp, &maxchar);
9657 res = PyUnicode_New(newlength, maxchar);
9658 if (res == NULL)
9659 goto leave;
9660 tmpend = tmp + newlength;
9661 outdata = PyUnicode_DATA(res);
9662 outkind = PyUnicode_KIND(res);
9663 switch (outkind) {
9664 case PyUnicode_1BYTE_KIND:
9665 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9666 break;
9667 case PyUnicode_2BYTE_KIND:
9668 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9669 break;
9670 case PyUnicode_4BYTE_KIND:
9671 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9672 break;
9673 default:
9674 assert(0);
9675 break;
9676 }
9677 leave:
9678 PyMem_FREE(tmp);
9679 return res;
9680}
9681
Tim Peters8ce9f162004-08-27 01:49:32 +00009682PyObject *
9683PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009686 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009688 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009689 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9690 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009691 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009693 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009695 int use_memcpy;
9696 unsigned char *res_data = NULL, *sep_data = NULL;
9697 PyObject *last_obj;
9698 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699
Tim Peters05eba1f2004-08-27 21:32:02 +00009700 fseq = PySequence_Fast(seq, "");
9701 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009702 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009703 }
9704
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009705 /* NOTE: the following code can't call back into Python code,
9706 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009707 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009708
Tim Peters05eba1f2004-08-27 21:32:02 +00009709 seqlen = PySequence_Fast_GET_SIZE(fseq);
9710 /* If empty sequence, return u"". */
9711 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009712 Py_DECREF(fseq);
9713 Py_INCREF(unicode_empty);
9714 res = unicode_empty;
9715 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009716 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009717
Tim Peters05eba1f2004-08-27 21:32:02 +00009718 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009719 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009720 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009721 if (seqlen == 1) {
9722 if (PyUnicode_CheckExact(items[0])) {
9723 res = items[0];
9724 Py_INCREF(res);
9725 Py_DECREF(fseq);
9726 return res;
9727 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009728 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009729 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009730 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009732 /* Set up sep and seplen */
9733 if (separator == NULL) {
9734 /* fall back to a blank space separator */
9735 sep = PyUnicode_FromOrdinal(' ');
9736 if (!sep)
9737 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009738 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009739 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009740 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009741 else {
9742 if (!PyUnicode_Check(separator)) {
9743 PyErr_Format(PyExc_TypeError,
9744 "separator: expected str instance,"
9745 " %.80s found",
9746 Py_TYPE(separator)->tp_name);
9747 goto onError;
9748 }
9749 if (PyUnicode_READY(separator))
9750 goto onError;
9751 sep = separator;
9752 seplen = PyUnicode_GET_LENGTH(separator);
9753 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9754 /* inc refcount to keep this code path symmetric with the
9755 above case of a blank separator */
9756 Py_INCREF(sep);
9757 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009758 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009759 }
9760
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009761 /* There are at least two things to join, or else we have a subclass
9762 * of str in the sequence.
9763 * Do a pre-pass to figure out the total amount of space we'll
9764 * need (sz), and see whether all argument are strings.
9765 */
9766 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009767#ifdef Py_DEBUG
9768 use_memcpy = 0;
9769#else
9770 use_memcpy = 1;
9771#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009772 for (i = 0; i < seqlen; i++) {
9773 const Py_ssize_t old_sz = sz;
9774 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009775 if (!PyUnicode_Check(item)) {
9776 PyErr_Format(PyExc_TypeError,
9777 "sequence item %zd: expected str instance,"
9778 " %.80s found",
9779 i, Py_TYPE(item)->tp_name);
9780 goto onError;
9781 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 if (PyUnicode_READY(item) == -1)
9783 goto onError;
9784 sz += PyUnicode_GET_LENGTH(item);
9785 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009786 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009787 if (i != 0)
9788 sz += seplen;
9789 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9790 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009792 goto onError;
9793 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009794 if (use_memcpy && last_obj != NULL) {
9795 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9796 use_memcpy = 0;
9797 }
9798 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009799 }
Tim Petersced69f82003-09-16 20:30:58 +00009800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009802 if (res == NULL)
9803 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009804
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009805 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009806#ifdef Py_DEBUG
9807 use_memcpy = 0;
9808#else
9809 if (use_memcpy) {
9810 res_data = PyUnicode_1BYTE_DATA(res);
9811 kind = PyUnicode_KIND(res);
9812 if (seplen != 0)
9813 sep_data = PyUnicode_1BYTE_DATA(sep);
9814 }
9815#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009817 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009818 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009819 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009820 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009821 if (use_memcpy) {
9822 Py_MEMCPY(res_data,
9823 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009824 kind * seplen);
9825 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009826 }
9827 else {
9828 copy_characters(res, res_offset, sep, 0, seplen);
9829 res_offset += seplen;
9830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009832 itemlen = PyUnicode_GET_LENGTH(item);
9833 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009834 if (use_memcpy) {
9835 Py_MEMCPY(res_data,
9836 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009837 kind * itemlen);
9838 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009839 }
9840 else {
9841 copy_characters(res, res_offset, item, 0, itemlen);
9842 res_offset += itemlen;
9843 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009844 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009845 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009846 if (use_memcpy)
9847 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009848 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009849 else
9850 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009851
Tim Peters05eba1f2004-08-27 21:32:02 +00009852 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009858 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009860 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861 return NULL;
9862}
9863
/* FILL(kind, data, value, start, length): store value into length
   consecutive characters of the buffer data, beginning at character index
   start.  kind selects the character width (1, 2 or 4 byte kind).

   Every argument is parenthesized in the expansion, but some of them are
   evaluated more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9887
Victor Stinner3fe55312012-01-04 00:33:50 +01009888Py_ssize_t
9889PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9890 Py_UCS4 fill_char)
9891{
9892 Py_ssize_t maxlen;
9893 enum PyUnicode_Kind kind;
9894 void *data;
9895
9896 if (!PyUnicode_Check(unicode)) {
9897 PyErr_BadInternalCall();
9898 return -1;
9899 }
9900 if (PyUnicode_READY(unicode) == -1)
9901 return -1;
9902 if (unicode_check_modifiable(unicode))
9903 return -1;
9904
9905 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9906 PyErr_SetString(PyExc_ValueError,
9907 "fill character is bigger than "
9908 "the string maximum character");
9909 return -1;
9910 }
9911
9912 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9913 length = Py_MIN(maxlen, length);
9914 if (length <= 0)
9915 return 0;
9916
9917 kind = PyUnicode_KIND(unicode);
9918 data = PyUnicode_DATA(unicode);
9919 FILL(kind, data, fill_char, start, length);
9920 return length;
9921}
9922
Victor Stinner9310abb2011-10-05 00:59:23 +02009923static PyObject *
9924pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009925 Py_ssize_t left,
9926 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 PyObject *u;
9930 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009931 int kind;
9932 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
9934 if (left < 0)
9935 left = 0;
9936 if (right < 0)
9937 right = 0;
9938
Victor Stinnerc4b49542011-12-11 22:44:26 +01009939 if (left == 0 && right == 0)
9940 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9943 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009944 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9945 return NULL;
9946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9948 if (fill > maxchar)
9949 maxchar = fill;
9950 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009951 if (!u)
9952 return NULL;
9953
9954 kind = PyUnicode_KIND(u);
9955 data = PyUnicode_DATA(u);
9956 if (left)
9957 FILL(kind, data, fill, 0, left);
9958 if (right)
9959 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009960 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009961 assert(_PyUnicode_CheckConsistency(u, 1));
9962 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965
Alexander Belopolsky40018472011-02-26 01:02:56 +00009966PyObject *
9967PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970
9971 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009972 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009974 if (PyUnicode_READY(string) == -1) {
9975 Py_DECREF(string);
9976 return NULL;
9977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978
Benjamin Petersonead6b532011-12-20 17:23:42 -06009979 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009981 if (PyUnicode_IS_ASCII(string))
9982 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009983 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984 PyUnicode_GET_LENGTH(string), keepends);
9985 else
9986 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009987 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009988 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 break;
9990 case PyUnicode_2BYTE_KIND:
9991 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009992 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 PyUnicode_GET_LENGTH(string), keepends);
9994 break;
9995 case PyUnicode_4BYTE_KIND:
9996 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009997 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 PyUnicode_GET_LENGTH(string), keepends);
9999 break;
10000 default:
10001 assert(0);
10002 list = 0;
10003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 Py_DECREF(string);
10005 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006}
10007
Alexander Belopolsky40018472011-02-26 01:02:56 +000010008static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010009split(PyObject *self,
10010 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010011 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 int kind1, kind2, kind;
10014 void *buf1, *buf2;
10015 Py_ssize_t len1, len2;
10016 PyObject* out;
10017
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010019 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 if (PyUnicode_READY(self) == -1)
10022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010025 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010027 if (PyUnicode_IS_ASCII(self))
10028 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010029 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010030 PyUnicode_GET_LENGTH(self), maxcount
10031 );
10032 else
10033 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010034 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010035 PyUnicode_GET_LENGTH(self), maxcount
10036 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 case PyUnicode_2BYTE_KIND:
10038 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 PyUnicode_GET_LENGTH(self), maxcount
10041 );
10042 case PyUnicode_4BYTE_KIND:
10043 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 PyUnicode_GET_LENGTH(self), maxcount
10046 );
10047 default:
10048 assert(0);
10049 return NULL;
10050 }
10051
10052 if (PyUnicode_READY(substring) == -1)
10053 return NULL;
10054
10055 kind1 = PyUnicode_KIND(self);
10056 kind2 = PyUnicode_KIND(substring);
10057 kind = kind1 > kind2 ? kind1 : kind2;
10058 buf1 = PyUnicode_DATA(self);
10059 buf2 = PyUnicode_DATA(substring);
10060 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010061 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 if (!buf1)
10063 return NULL;
10064 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010065 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 if (!buf2) {
10067 if (kind1 != kind) PyMem_Free(buf1);
10068 return NULL;
10069 }
10070 len1 = PyUnicode_GET_LENGTH(self);
10071 len2 = PyUnicode_GET_LENGTH(substring);
10072
Benjamin Petersonead6b532011-12-20 17:23:42 -060010073 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010075 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10076 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010077 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010078 else
10079 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010080 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 break;
10082 case PyUnicode_2BYTE_KIND:
10083 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010084 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 break;
10086 case PyUnicode_4BYTE_KIND:
10087 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010088 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 break;
10090 default:
10091 out = NULL;
10092 }
10093 if (kind1 != kind)
10094 PyMem_Free(buf1);
10095 if (kind2 != kind)
10096 PyMem_Free(buf2);
10097 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098}
10099
Alexander Belopolsky40018472011-02-26 01:02:56 +000010100static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010101rsplit(PyObject *self,
10102 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010103 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 int kind1, kind2, kind;
10106 void *buf1, *buf2;
10107 Py_ssize_t len1, len2;
10108 PyObject* out;
10109
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010110 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010111 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (PyUnicode_READY(self) == -1)
10114 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010117 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 if (PyUnicode_IS_ASCII(self))
10120 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010121 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010122 PyUnicode_GET_LENGTH(self), maxcount
10123 );
10124 else
10125 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010126 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010127 PyUnicode_GET_LENGTH(self), maxcount
10128 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 case PyUnicode_2BYTE_KIND:
10130 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010131 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 PyUnicode_GET_LENGTH(self), maxcount
10133 );
10134 case PyUnicode_4BYTE_KIND:
10135 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 PyUnicode_GET_LENGTH(self), maxcount
10138 );
10139 default:
10140 assert(0);
10141 return NULL;
10142 }
10143
10144 if (PyUnicode_READY(substring) == -1)
10145 return NULL;
10146
10147 kind1 = PyUnicode_KIND(self);
10148 kind2 = PyUnicode_KIND(substring);
10149 kind = kind1 > kind2 ? kind1 : kind2;
10150 buf1 = PyUnicode_DATA(self);
10151 buf2 = PyUnicode_DATA(substring);
10152 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010153 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (!buf1)
10155 return NULL;
10156 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010157 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (!buf2) {
10159 if (kind1 != kind) PyMem_Free(buf1);
10160 return NULL;
10161 }
10162 len1 = PyUnicode_GET_LENGTH(self);
10163 len2 = PyUnicode_GET_LENGTH(substring);
10164
Benjamin Petersonead6b532011-12-20 17:23:42 -060010165 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010167 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10168 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010170 else
10171 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 break;
10174 case PyUnicode_2BYTE_KIND:
10175 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010176 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 break;
10178 case PyUnicode_4BYTE_KIND:
10179 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 break;
10182 default:
10183 out = NULL;
10184 }
10185 if (kind1 != kind)
10186 PyMem_Free(buf1);
10187 if (kind2 != kind)
10188 PyMem_Free(buf2);
10189 return out;
10190}
10191
10192static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10194 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010196 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10199 return asciilib_find(buf1, len1, buf2, len2, offset);
10200 else
10201 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 case PyUnicode_2BYTE_KIND:
10203 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10204 case PyUnicode_4BYTE_KIND:
10205 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10206 }
10207 assert(0);
10208 return -1;
10209}
10210
10211static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10213 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010215 switch (kind) {
10216 case PyUnicode_1BYTE_KIND:
10217 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10218 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10219 else
10220 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10221 case PyUnicode_2BYTE_KIND:
10222 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10223 case PyUnicode_4BYTE_KIND:
10224 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10225 }
10226 assert(0);
10227 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010228}
10229
Alexander Belopolsky40018472011-02-26 01:02:56 +000010230static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231replace(PyObject *self, PyObject *str1,
10232 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 PyObject *u;
10235 char *sbuf = PyUnicode_DATA(self);
10236 char *buf1 = PyUnicode_DATA(str1);
10237 char *buf2 = PyUnicode_DATA(str2);
10238 int srelease = 0, release1 = 0, release2 = 0;
10239 int skind = PyUnicode_KIND(self);
10240 int kind1 = PyUnicode_KIND(str1);
10241 int kind2 = PyUnicode_KIND(str2);
10242 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10243 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10244 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010245 int mayshrink;
10246 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
10248 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010249 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010251 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252
Victor Stinner59de0ee2011-10-07 10:01:28 +020010253 if (str1 == str2)
10254 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (skind < kind1)
10256 /* substring too wide to be present */
10257 goto nothing;
10258
Victor Stinner49a0a212011-10-12 23:46:10 +020010259 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10260 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10261 /* Replacing str1 with str2 may cause a maxchar reduction in the
10262 result string. */
10263 mayshrink = (maxchar_str2 < maxchar);
10264 maxchar = Py_MAX(maxchar, maxchar_str2);
10265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010269 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010271 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010272 Py_UCS4 u1, u2;
10273 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010274 Py_ssize_t index, pos;
10275 char *src;
10276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010278 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10279 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010280 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010283 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010285 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010287
10288 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10289 index = 0;
10290 src = sbuf;
10291 while (--maxcount)
10292 {
10293 pos++;
10294 src += pos * PyUnicode_KIND(self);
10295 slen -= pos;
10296 index += pos;
10297 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10298 if (pos < 0)
10299 break;
10300 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10301 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010302 }
10303 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 int rkind = skind;
10305 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010306 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (kind1 < rkind) {
10309 /* widen substring */
10310 buf1 = _PyUnicode_AsKind(str1, rkind);
10311 if (!buf1) goto error;
10312 release1 = 1;
10313 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010314 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315 if (i < 0)
10316 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (rkind > kind2) {
10318 /* widen replacement */
10319 buf2 = _PyUnicode_AsKind(str2, rkind);
10320 if (!buf2) goto error;
10321 release2 = 1;
10322 }
10323 else if (rkind < kind2) {
10324 /* widen self and buf1 */
10325 rkind = kind2;
10326 if (release1) PyMem_Free(buf1);
10327 sbuf = _PyUnicode_AsKind(self, rkind);
10328 if (!sbuf) goto error;
10329 srelease = 1;
10330 buf1 = _PyUnicode_AsKind(str1, rkind);
10331 if (!buf1) goto error;
10332 release1 = 1;
10333 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010334 u = PyUnicode_New(slen, maxchar);
10335 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010337 assert(PyUnicode_KIND(u) == rkind);
10338 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010339
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010340 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010341 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010342 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010344 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010346
10347 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010349 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010351 if (i == -1)
10352 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010355 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010359 }
10360 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 Py_ssize_t n, i, j, ires;
10362 Py_ssize_t product, new_size;
10363 int rkind = skind;
10364 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010367 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 buf1 = _PyUnicode_AsKind(str1, rkind);
10369 if (!buf1) goto error;
10370 release1 = 1;
10371 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010373 if (n == 0)
10374 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010376 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 buf2 = _PyUnicode_AsKind(str2, rkind);
10378 if (!buf2) goto error;
10379 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010382 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 rkind = kind2;
10384 sbuf = _PyUnicode_AsKind(self, rkind);
10385 if (!sbuf) goto error;
10386 srelease = 1;
10387 if (release1) PyMem_Free(buf1);
10388 buf1 = _PyUnicode_AsKind(str1, rkind);
10389 if (!buf1) goto error;
10390 release1 = 1;
10391 }
10392 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10393 PyUnicode_GET_LENGTH(str1))); */
10394 product = n * (len2-len1);
10395 if ((product / (len2-len1)) != n) {
10396 PyErr_SetString(PyExc_OverflowError,
10397 "replace string is too long");
10398 goto error;
10399 }
10400 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010401 if (new_size == 0) {
10402 Py_INCREF(unicode_empty);
10403 u = unicode_empty;
10404 goto done;
10405 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10407 PyErr_SetString(PyExc_OverflowError,
10408 "replace string is too long");
10409 goto error;
10410 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010411 u = PyUnicode_New(new_size, maxchar);
10412 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 assert(PyUnicode_KIND(u) == rkind);
10415 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 ires = i = 0;
10417 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010418 while (n-- > 0) {
10419 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010421 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010423 if (j == -1)
10424 break;
10425 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010427 memcpy(res + rkind * ires,
10428 sbuf + rkind * i,
10429 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 }
10432 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010434 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010436 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010442 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010443 memcpy(res + rkind * ires,
10444 sbuf + rkind * i,
10445 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010446 }
10447 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 /* interleave */
10449 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010452 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 if (--n <= 0)
10455 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010456 memcpy(res + rkind * ires,
10457 sbuf + rkind * i,
10458 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 ires++;
10460 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
10463 sbuf + rkind * i,
10464 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010466 }
10467
10468 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010469 unicode_adjust_maxchar(&u);
10470 if (u == NULL)
10471 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010473
10474 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (srelease)
10476 PyMem_FREE(sbuf);
10477 if (release1)
10478 PyMem_FREE(buf1);
10479 if (release2)
10480 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010481 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (srelease)
10487 PyMem_FREE(sbuf);
10488 if (release1)
10489 PyMem_FREE(buf1);
10490 if (release2)
10491 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010492 return unicode_result_unchanged(self);
10493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 error:
10495 if (srelease && sbuf)
10496 PyMem_FREE(sbuf);
10497 if (release1 && buf1)
10498 PyMem_FREE(buf1);
10499 if (release2 && buf2)
10500 PyMem_FREE(buf2);
10501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502}
10503
10504/* --- Unicode Object Methods --------------------------------------------- */
10505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508\n\
10509Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010510characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
10512static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010513unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010515 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516}
10517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010518PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520\n\
10521Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010522have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
10524static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010525unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010527 if (PyUnicode_READY(self) == -1)
10528 return NULL;
10529 if (PyUnicode_GET_LENGTH(self) == 0)
10530 return unicode_result_unchanged(self);
10531 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532}
10533
Benjamin Petersond5890c82012-01-14 13:23:30 -050010534PyDoc_STRVAR(casefold__doc__,
10535 "S.casefold() -> str\n\
10536\n\
10537Return a version of S suitable for caseless comparisons.");
10538
10539static PyObject *
10540unicode_casefold(PyObject *self)
10541{
10542 if (PyUnicode_READY(self) == -1)
10543 return NULL;
10544 if (PyUnicode_IS_ASCII(self))
10545 return ascii_upper_or_lower(self, 1);
10546 return case_operation(self, do_casefold);
10547}
10548
10549
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010550/* Argument converter. Coerces to a single unicode character */
10551
10552static int
10553convert_uc(PyObject *obj, void *addr)
10554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010556 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010557
Benjamin Peterson14339b62009-01-31 16:36:08 +000010558 uniobj = PyUnicode_FromObject(obj);
10559 if (uniobj == NULL) {
10560 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010562 return 0;
10563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010565 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567 Py_DECREF(uniobj);
10568 return 0;
10569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 Py_DECREF(uniobj);
10572 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010573}
10574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010575PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010578Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010579done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580
10581static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010582unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010584 Py_ssize_t marg, left;
10585 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 Py_UCS4 fillchar = ' ';
10587
Victor Stinnere9a29352011-10-01 02:14:59 +020010588 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590
Benjamin Petersonbac79492012-01-14 13:34:47 -050010591 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592 return NULL;
10593
Victor Stinnerc4b49542011-12-11 22:44:26 +010010594 if (PyUnicode_GET_LENGTH(self) >= width)
10595 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Victor Stinnerc4b49542011-12-11 22:44:26 +010010597 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 left = marg / 2 + (marg & width & 1);
10599
Victor Stinner9310abb2011-10-05 00:59:23 +020010600 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601}
10602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603/* This function assumes that str1 and str2 are readied by the caller. */
10604
Marc-André Lemburge5034372000-08-08 08:04:29 +000010605static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010606unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 int kind1, kind2;
10609 void *data1, *data2;
10610 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 kind1 = PyUnicode_KIND(str1);
10613 kind2 = PyUnicode_KIND(str2);
10614 data1 = PyUnicode_DATA(str1);
10615 data2 = PyUnicode_DATA(str2);
10616 len1 = PyUnicode_GET_LENGTH(str1);
10617 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 for (i = 0; i < len1 && i < len2; ++i) {
10620 Py_UCS4 c1, c2;
10621 c1 = PyUnicode_READ(kind1, data1, i);
10622 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010623
10624 if (c1 != c2)
10625 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010626 }
10627
10628 return (len1 < len2) ? -1 : (len1 != len2);
10629}
10630
Alexander Belopolsky40018472011-02-26 01:02:56 +000010631int
10632PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10635 if (PyUnicode_READY(left) == -1 ||
10636 PyUnicode_READY(right) == -1)
10637 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010638 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010640 PyErr_Format(PyExc_TypeError,
10641 "Can't compare %.100s and %.100s",
10642 left->ob_type->tp_name,
10643 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644 return -1;
10645}
10646
Martin v. Löwis5b222132007-06-10 09:51:05 +000010647int
10648PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 Py_ssize_t i;
10651 int kind;
10652 void *data;
10653 Py_UCS4 chr;
10654
Victor Stinner910337b2011-10-03 03:20:16 +020010655 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (PyUnicode_READY(uni) == -1)
10657 return -1;
10658 kind = PyUnicode_KIND(uni);
10659 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010660 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10662 if (chr != str[i])
10663 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010664 /* This check keeps Python strings that end in '\0' from comparing equal
10665 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010668 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010670 return 0;
10671}
10672
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010673
Benjamin Peterson29060642009-01-31 22:14:21 +000010674#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010675 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010676
Alexander Belopolsky40018472011-02-26 01:02:56 +000010677PyObject *
10678PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010679{
10680 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010681
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010682 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10683 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (PyUnicode_READY(left) == -1 ||
10685 PyUnicode_READY(right) == -1)
10686 return NULL;
10687 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10688 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010689 if (op == Py_EQ) {
10690 Py_INCREF(Py_False);
10691 return Py_False;
10692 }
10693 if (op == Py_NE) {
10694 Py_INCREF(Py_True);
10695 return Py_True;
10696 }
10697 }
10698 if (left == right)
10699 result = 0;
10700 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010701 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010702
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010703 /* Convert the return value to a Boolean */
10704 switch (op) {
10705 case Py_EQ:
10706 v = TEST_COND(result == 0);
10707 break;
10708 case Py_NE:
10709 v = TEST_COND(result != 0);
10710 break;
10711 case Py_LE:
10712 v = TEST_COND(result <= 0);
10713 break;
10714 case Py_GE:
10715 v = TEST_COND(result >= 0);
10716 break;
10717 case Py_LT:
10718 v = TEST_COND(result == -1);
10719 break;
10720 case Py_GT:
10721 v = TEST_COND(result == 1);
10722 break;
10723 default:
10724 PyErr_BadArgument();
10725 return NULL;
10726 }
10727 Py_INCREF(v);
10728 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010730
Brian Curtindfc80e32011-08-10 20:28:54 -050010731 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010732}
10733
Alexander Belopolsky40018472011-02-26 01:02:56 +000010734int
10735PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010736{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010737 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 int kind1, kind2, kind;
10739 void *buf1, *buf2;
10740 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010741 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010742
10743 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010744 sub = PyUnicode_FromObject(element);
10745 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 PyErr_Format(PyExc_TypeError,
10747 "'in <string>' requires string as left operand, not %s",
10748 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010749 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010750 }
10751
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010753 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010754 Py_DECREF(sub);
10755 return -1;
10756 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010757 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10758 Py_DECREF(sub);
10759 Py_DECREF(str);
10760 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 kind1 = PyUnicode_KIND(str);
10763 kind2 = PyUnicode_KIND(sub);
10764 kind = kind1 > kind2 ? kind1 : kind2;
10765 buf1 = PyUnicode_DATA(str);
10766 buf2 = PyUnicode_DATA(sub);
10767 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010768 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (!buf1) {
10770 Py_DECREF(sub);
10771 return -1;
10772 }
10773 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010774 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 if (!buf2) {
10776 Py_DECREF(sub);
10777 if (kind1 != kind) PyMem_Free(buf1);
10778 return -1;
10779 }
10780 len1 = PyUnicode_GET_LENGTH(str);
10781 len2 = PyUnicode_GET_LENGTH(sub);
10782
Benjamin Petersonead6b532011-12-20 17:23:42 -060010783 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 case PyUnicode_1BYTE_KIND:
10785 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10786 break;
10787 case PyUnicode_2BYTE_KIND:
10788 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10789 break;
10790 case PyUnicode_4BYTE_KIND:
10791 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10792 break;
10793 default:
10794 result = -1;
10795 assert(0);
10796 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010797
10798 Py_DECREF(str);
10799 Py_DECREF(sub);
10800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (kind1 != kind)
10802 PyMem_Free(buf1);
10803 if (kind2 != kind)
10804 PyMem_Free(buf2);
10805
Guido van Rossum403d68b2000-03-13 15:55:09 +000010806 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010807}
10808
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809/* Concat to string or Unicode object giving a new Unicode object. */
10810
Alexander Belopolsky40018472011-02-26 01:02:56 +000010811PyObject *
10812PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010815 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010816 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817
10818 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010824 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825
10826 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010827 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010828 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010831 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010832 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 }
10835
Victor Stinner488fa492011-12-12 00:01:39 +010010836 u_len = PyUnicode_GET_LENGTH(u);
10837 v_len = PyUnicode_GET_LENGTH(v);
10838 if (u_len > PY_SSIZE_T_MAX - v_len) {
10839 PyErr_SetString(PyExc_OverflowError,
10840 "strings are too large to concat");
10841 goto onError;
10842 }
10843 new_len = u_len + v_len;
10844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010846 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10847 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010850 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010852 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010853 copy_characters(w, 0, u, 0, u_len);
10854 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 Py_DECREF(u);
10856 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010857 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 Py_XDECREF(u);
10862 Py_XDECREF(v);
10863 return NULL;
10864}
10865
Walter Dörwald1ab83302007-05-18 17:15:44 +000010866void
Victor Stinner23e56682011-10-03 03:54:37 +020010867PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010868{
Victor Stinner23e56682011-10-03 03:54:37 +020010869 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010870 Py_UCS4 maxchar, maxchar2;
10871 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010872
10873 if (p_left == NULL) {
10874 if (!PyErr_Occurred())
10875 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010876 return;
10877 }
Victor Stinner23e56682011-10-03 03:54:37 +020010878 left = *p_left;
10879 if (right == NULL || !PyUnicode_Check(left)) {
10880 if (!PyErr_Occurred())
10881 PyErr_BadInternalCall();
10882 goto error;
10883 }
10884
Benjamin Petersonbac79492012-01-14 13:34:47 -050010885 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010886 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010887 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010888 goto error;
10889
Victor Stinner488fa492011-12-12 00:01:39 +010010890 /* Shortcuts */
10891 if (left == unicode_empty) {
10892 Py_DECREF(left);
10893 Py_INCREF(right);
10894 *p_left = right;
10895 return;
10896 }
10897 if (right == unicode_empty)
10898 return;
10899
10900 left_len = PyUnicode_GET_LENGTH(left);
10901 right_len = PyUnicode_GET_LENGTH(right);
10902 if (left_len > PY_SSIZE_T_MAX - right_len) {
10903 PyErr_SetString(PyExc_OverflowError,
10904 "strings are too large to concat");
10905 goto error;
10906 }
10907 new_len = left_len + right_len;
10908
10909 if (unicode_modifiable(left)
10910 && PyUnicode_CheckExact(right)
10911 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010912 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10913 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010914 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010915 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010916 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10917 {
10918 /* append inplace */
10919 if (unicode_resize(p_left, new_len) != 0) {
10920 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10921 * deallocated so it cannot be put back into
10922 * 'variable'. The MemoryError is raised when there
10923 * is no value in 'variable', which might (very
10924 * remotely) be a cause of incompatibilities.
10925 */
10926 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010927 }
Victor Stinner488fa492011-12-12 00:01:39 +010010928 /* copy 'right' into the newly allocated area of 'left' */
10929 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010930 }
Victor Stinner488fa492011-12-12 00:01:39 +010010931 else {
10932 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10933 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10934 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010935
Victor Stinner488fa492011-12-12 00:01:39 +010010936 /* Concat the two Unicode strings */
10937 res = PyUnicode_New(new_len, maxchar);
10938 if (res == NULL)
10939 goto error;
10940 copy_characters(res, 0, left, 0, left_len);
10941 copy_characters(res, left_len, right, 0, right_len);
10942 Py_DECREF(left);
10943 *p_left = res;
10944 }
10945 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010946 return;
10947
10948error:
Victor Stinner488fa492011-12-12 00:01:39 +010010949 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010950}
10951
10952void
10953PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10954{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010955 PyUnicode_Append(pleft, right);
10956 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010957}
10958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010959PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010960 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010962Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010963string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010964interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
10966static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010967unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010969 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010970 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010971 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 int kind1, kind2, kind;
10974 void *buf1, *buf2;
10975 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
Jesus Ceaac451502011-04-20 17:09:23 +020010977 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10978 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 kind1 = PyUnicode_KIND(self);
10982 kind2 = PyUnicode_KIND(substring);
10983 kind = kind1 > kind2 ? kind1 : kind2;
10984 buf1 = PyUnicode_DATA(self);
10985 buf2 = PyUnicode_DATA(substring);
10986 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010987 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (!buf1) {
10989 Py_DECREF(substring);
10990 return NULL;
10991 }
10992 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010993 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 if (!buf2) {
10995 Py_DECREF(substring);
10996 if (kind1 != kind) PyMem_Free(buf1);
10997 return NULL;
10998 }
10999 len1 = PyUnicode_GET_LENGTH(self);
11000 len2 = PyUnicode_GET_LENGTH(substring);
11001
11002 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011003 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 case PyUnicode_1BYTE_KIND:
11005 iresult = ucs1lib_count(
11006 ((Py_UCS1*)buf1) + start, end - start,
11007 buf2, len2, PY_SSIZE_T_MAX
11008 );
11009 break;
11010 case PyUnicode_2BYTE_KIND:
11011 iresult = ucs2lib_count(
11012 ((Py_UCS2*)buf1) + start, end - start,
11013 buf2, len2, PY_SSIZE_T_MAX
11014 );
11015 break;
11016 case PyUnicode_4BYTE_KIND:
11017 iresult = ucs4lib_count(
11018 ((Py_UCS4*)buf1) + start, end - start,
11019 buf2, len2, PY_SSIZE_T_MAX
11020 );
11021 break;
11022 default:
11023 assert(0); iresult = 0;
11024 }
11025
11026 result = PyLong_FromSsize_t(iresult);
11027
11028 if (kind1 != kind)
11029 PyMem_Free(buf1);
11030 if (kind2 != kind)
11031 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
11033 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011034
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 return result;
11036}
11037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011038PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011039 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011041Encode S using the codec registered for encoding. Default encoding\n\
11042is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011043handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011044a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11045'xmlcharrefreplace' as well as any other name registered with\n\
11046codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
11048static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011049unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011051 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052 char *encoding = NULL;
11053 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011054
Benjamin Peterson308d6372009-09-18 21:42:35 +000011055 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11056 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011058 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011059}
11060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063\n\
11064Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011065If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066
11067static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011068unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011070 Py_ssize_t i, j, line_pos, src_len, incr;
11071 Py_UCS4 ch;
11072 PyObject *u;
11073 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011075 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011076 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
11078 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
Antoine Pitrou22425222011-10-04 19:10:51 +020011081 if (PyUnicode_READY(self) == -1)
11082 return NULL;
11083
Thomas Wouters7e474022000-07-16 12:04:32 +000011084 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011085 src_len = PyUnicode_GET_LENGTH(self);
11086 i = j = line_pos = 0;
11087 kind = PyUnicode_KIND(self);
11088 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011089 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011090 for (; i < src_len; i++) {
11091 ch = PyUnicode_READ(kind, src_data, i);
11092 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011093 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011095 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011096 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011097 goto overflow;
11098 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011100 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011103 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011104 goto overflow;
11105 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011107 if (ch == '\n' || ch == '\r')
11108 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011110 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011111 if (!found)
11112 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011113
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011115 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 if (!u)
11117 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011118 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119
Antoine Pitroue71d5742011-10-04 15:55:09 +020011120 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
Antoine Pitroue71d5742011-10-04 15:55:09 +020011122 for (; i < src_len; i++) {
11123 ch = PyUnicode_READ(kind, src_data, i);
11124 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011126 incr = tabsize - (line_pos % tabsize);
11127 line_pos += incr;
11128 while (incr--) {
11129 PyUnicode_WRITE(kind, dest_data, j, ' ');
11130 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011131 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011132 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011133 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011135 line_pos++;
11136 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011137 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011138 if (ch == '\n' || ch == '\r')
11139 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011141 }
11142 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011143 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011144
Antoine Pitroue71d5742011-10-04 15:55:09 +020011145 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011146 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148}
11149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011150PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152\n\
11153Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011154such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155arguments start and end are interpreted as in slice notation.\n\
11156\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011157Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
11159static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011162 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011163 Py_ssize_t start;
11164 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011165 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166
Jesus Ceaac451502011-04-20 17:09:23 +020011167 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11168 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (PyUnicode_READY(self) == -1)
11172 return NULL;
11173 if (PyUnicode_READY(substring) == -1)
11174 return NULL;
11175
Victor Stinner7931d9a2011-11-04 00:22:48 +010011176 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177
11178 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (result == -2)
11181 return NULL;
11182
Christian Heimes217cfd12007-12-02 14:31:20 +000011183 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184}
11185
11186static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011187unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011189 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11190 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193}
11194
Guido van Rossumc2504932007-09-18 19:42:40 +000011195/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011196 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011197static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011198unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Guido van Rossumc2504932007-09-18 19:42:40 +000011200 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011201 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 if (_PyUnicode_HASH(self) != -1)
11204 return _PyUnicode_HASH(self);
11205 if (PyUnicode_READY(self) == -1)
11206 return -1;
11207 len = PyUnicode_GET_LENGTH(self);
11208
11209 /* The hash function as a macro, gets expanded three times below. */
11210#define HASH(P) \
11211 x = (Py_uhash_t)*P << 7; \
11212 while (--len >= 0) \
11213 x = (1000003*x) ^ (Py_uhash_t)*P++;
11214
11215 switch (PyUnicode_KIND(self)) {
11216 case PyUnicode_1BYTE_KIND: {
11217 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11218 HASH(c);
11219 break;
11220 }
11221 case PyUnicode_2BYTE_KIND: {
11222 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11223 HASH(s);
11224 break;
11225 }
11226 default: {
11227 Py_UCS4 *l;
11228 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11229 "Impossible switch case in unicode_hash");
11230 l = PyUnicode_4BYTE_DATA(self);
11231 HASH(l);
11232 break;
11233 }
11234 }
11235 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11236
Guido van Rossumc2504932007-09-18 19:42:40 +000011237 if (x == -1)
11238 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011240 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011247Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
11249static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011252 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011253 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011254 Py_ssize_t start;
11255 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
Jesus Ceaac451502011-04-20 17:09:23 +020011257 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11258 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 if (PyUnicode_READY(self) == -1)
11262 return NULL;
11263 if (PyUnicode_READY(substring) == -1)
11264 return NULL;
11265
Victor Stinner7931d9a2011-11-04 00:22:48 +010011266 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
11268 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (result == -2)
11271 return NULL;
11272
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 if (result < 0) {
11274 PyErr_SetString(PyExc_ValueError, "substring not found");
11275 return NULL;
11276 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011277
Christian Heimes217cfd12007-12-02 14:31:20 +000011278 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279}
11280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011281PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011284Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011285at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
11287static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011288unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 Py_ssize_t i, length;
11291 int kind;
11292 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 int cased;
11294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (PyUnicode_READY(self) == -1)
11296 return NULL;
11297 length = PyUnicode_GET_LENGTH(self);
11298 kind = PyUnicode_KIND(self);
11299 data = PyUnicode_DATA(self);
11300
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (length == 1)
11303 return PyBool_FromLong(
11304 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011306 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011309
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 for (i = 0; i < length; i++) {
11312 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011313
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11315 return PyBool_FromLong(0);
11316 else if (!cased && Py_UNICODE_ISLOWER(ch))
11317 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011319 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320}
11321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011325Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
11328static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011329unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 Py_ssize_t i, length;
11332 int kind;
11333 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 int cased;
11335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (PyUnicode_READY(self) == -1)
11337 return NULL;
11338 length = PyUnicode_GET_LENGTH(self);
11339 kind = PyUnicode_KIND(self);
11340 data = PyUnicode_DATA(self);
11341
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (length == 1)
11344 return PyBool_FromLong(
11345 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011347 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011350
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 for (i = 0; i < length; i++) {
11353 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011354
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11356 return PyBool_FromLong(0);
11357 else if (!cased && Py_UNICODE_ISUPPER(ch))
11358 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011360 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361}
11362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011363PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011366Return True if S is a titlecased string and there is at least one\n\
11367character in S, i.e. upper- and titlecase characters may only\n\
11368follow uncased characters and lowercase characters only cased ones.\n\
11369Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011372unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 Py_ssize_t i, length;
11375 int kind;
11376 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377 int cased, previous_is_cased;
11378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (PyUnicode_READY(self) == -1)
11380 return NULL;
11381 length = PyUnicode_GET_LENGTH(self);
11382 kind = PyUnicode_KIND(self);
11383 data = PyUnicode_DATA(self);
11384
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 if (length == 1) {
11387 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11388 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11389 (Py_UNICODE_ISUPPER(ch) != 0));
11390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011392 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011395
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396 cased = 0;
11397 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 for (i = 0; i < length; i++) {
11399 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011400
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11402 if (previous_is_cased)
11403 return PyBool_FromLong(0);
11404 previous_is_cased = 1;
11405 cased = 1;
11406 }
11407 else if (Py_UNICODE_ISLOWER(ch)) {
11408 if (!previous_is_cased)
11409 return PyBool_FromLong(0);
11410 previous_is_cased = 1;
11411 cased = 1;
11412 }
11413 else
11414 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011416 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417}
11418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011419PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011422Return True if all characters in S are whitespace\n\
11423and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
11425static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 Py_ssize_t i, length;
11429 int kind;
11430 void *data;
11431
11432 if (PyUnicode_READY(self) == -1)
11433 return NULL;
11434 length = PyUnicode_GET_LENGTH(self);
11435 kind = PyUnicode_KIND(self);
11436 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 if (length == 1)
11440 return PyBool_FromLong(
11441 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011443 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 for (i = 0; i < length; i++) {
11448 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011449 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453}
11454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011455PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011457\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011458Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011460
11461static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011462unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 Py_ssize_t i, length;
11465 int kind;
11466 void *data;
11467
11468 if (PyUnicode_READY(self) == -1)
11469 return NULL;
11470 length = PyUnicode_GET_LENGTH(self);
11471 kind = PyUnicode_KIND(self);
11472 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011473
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011474 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 if (length == 1)
11476 return PyBool_FromLong(
11477 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011478
11479 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 for (i = 0; i < length; i++) {
11484 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011486 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011487 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011492\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011493Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011494and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011495
11496static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011497unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 int kind;
11500 void *data;
11501 Py_ssize_t len, i;
11502
11503 if (PyUnicode_READY(self) == -1)
11504 return NULL;
11505
11506 kind = PyUnicode_KIND(self);
11507 data = PyUnicode_DATA(self);
11508 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011509
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011510 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (len == 1) {
11512 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11513 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11514 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011515
11516 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 for (i = 0; i < len; i++) {
11521 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011522 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011525 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011526}
11527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011528PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011531Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011535unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 Py_ssize_t i, length;
11538 int kind;
11539 void *data;
11540
11541 if (PyUnicode_READY(self) == -1)
11542 return NULL;
11543 length = PyUnicode_GET_LENGTH(self);
11544 kind = PyUnicode_KIND(self);
11545 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (length == 1)
11549 return PyBool_FromLong(
11550 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011552 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 for (i = 0; i < length; i++) {
11557 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011560 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561}
11562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011563PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011566Return True if all characters in S are digits\n\
11567and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568
11569static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011570unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 Py_ssize_t i, length;
11573 int kind;
11574 void *data;
11575
11576 if (PyUnicode_READY(self) == -1)
11577 return NULL;
11578 length = PyUnicode_GET_LENGTH(self);
11579 kind = PyUnicode_KIND(self);
11580 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 if (length == 1) {
11584 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11585 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011588 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 for (i = 0; i < length; i++) {
11593 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011596 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597}
11598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011599PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011602Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
11605static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011606unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 Py_ssize_t i, length;
11609 int kind;
11610 void *data;
11611
11612 if (PyUnicode_READY(self) == -1)
11613 return NULL;
11614 length = PyUnicode_GET_LENGTH(self);
11615 kind = PyUnicode_KIND(self);
11616 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (length == 1)
11620 return PyBool_FromLong(
11621 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011623 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 for (i = 0; i < length; i++) {
11628 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632}
11633
Martin v. Löwis47383402007-08-15 07:32:56 +000011634int
11635PyUnicode_IsIdentifier(PyObject *self)
11636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 int kind;
11638 void *data;
11639 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011640 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 if (PyUnicode_READY(self) == -1) {
11643 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 }
11646
11647 /* Special case for empty strings */
11648 if (PyUnicode_GET_LENGTH(self) == 0)
11649 return 0;
11650 kind = PyUnicode_KIND(self);
11651 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011652
11653 /* PEP 3131 says that the first character must be in
11654 XID_Start and subsequent characters in XID_Continue,
11655 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011656 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011657 letters, digits, underscore). However, given the current
11658 definition of XID_Start and XID_Continue, it is sufficient
11659 to check just for these, except that _ must be allowed
11660 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011662 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011663 return 0;
11664
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011665 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011668 return 1;
11669}
11670
11671PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011673\n\
11674Return True if S is a valid identifier according\n\
11675to the language definition.");
11676
11677static PyObject*
11678unicode_isidentifier(PyObject *self)
11679{
11680 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11681}
11682
Georg Brandl559e5d72008-06-11 18:37:52 +000011683PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011685\n\
11686Return True if all characters in S are considered\n\
11687printable in repr() or S is empty, False otherwise.");
11688
11689static PyObject*
11690unicode_isprintable(PyObject *self)
11691{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 Py_ssize_t i, length;
11693 int kind;
11694 void *data;
11695
11696 if (PyUnicode_READY(self) == -1)
11697 return NULL;
11698 length = PyUnicode_GET_LENGTH(self);
11699 kind = PyUnicode_KIND(self);
11700 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011701
11702 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (length == 1)
11704 return PyBool_FromLong(
11705 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 for (i = 0; i < length; i++) {
11708 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011709 Py_RETURN_FALSE;
11710 }
11711 }
11712 Py_RETURN_TRUE;
11713}
11714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011715PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011716 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717\n\
11718Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011719iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
11721static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011722unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011724 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725}
11726
Martin v. Löwis18e16552006-02-15 17:27:45 +000011727static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (PyUnicode_READY(self) == -1)
11731 return -1;
11732 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733}
11734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011735PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011738Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011739done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740
11741static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011742unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011744 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 Py_UCS4 fillchar = ' ';
11746
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011747 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 return NULL;
11749
Benjamin Petersonbac79492012-01-14 13:34:47 -050011750 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
Victor Stinnerc4b49542011-12-11 22:44:26 +010011753 if (PyUnicode_GET_LENGTH(self) >= width)
11754 return unicode_result_unchanged(self);
11755
11756 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757}
11758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011759PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011765unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011767 if (PyUnicode_READY(self) == -1)
11768 return NULL;
11769 if (PyUnicode_IS_ASCII(self))
11770 return ascii_upper_or_lower(self, 1);
11771 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772}
11773
/* Strip modes used by the strip helpers; each value is also an index
   into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three-character "|O:" prefix of
   the matching format string.  The argument is parenthesized so that any
   expression can be passed safely. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11782
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011783/* externally visible for str.strip(unicode) */
11784PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011785_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 void *data;
11788 int kind;
11789 Py_ssize_t i, j, len;
11790 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11793 return NULL;
11794
11795 kind = PyUnicode_KIND(self);
11796 data = PyUnicode_DATA(self);
11797 len = PyUnicode_GET_LENGTH(self);
11798 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11799 PyUnicode_DATA(sepobj),
11800 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011801
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 i = 0;
11803 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 while (i < len &&
11805 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 i++;
11807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011809
Benjamin Peterson14339b62009-01-31 16:36:08 +000011810 j = len;
11811 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 do {
11813 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 } while (j >= i &&
11815 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
Victor Stinner7931d9a2011-11-04 00:22:48 +010011819 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820}
11821
11822PyObject*
11823PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11824{
11825 unsigned char *data;
11826 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011827 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828
Victor Stinnerde636f32011-10-01 03:55:54 +020011829 if (PyUnicode_READY(self) == -1)
11830 return NULL;
11831
11832 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11833
Victor Stinner12bab6d2011-10-01 01:53:49 +020011834 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011835 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836
Victor Stinner12bab6d2011-10-01 01:53:49 +020011837 length = end - start;
11838 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011839 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840
Victor Stinnerde636f32011-10-01 03:55:54 +020011841 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011842 PyErr_SetString(PyExc_IndexError, "string index out of range");
11843 return NULL;
11844 }
11845
Victor Stinnerb9275c12011-10-05 14:01:42 +020011846 if (PyUnicode_IS_ASCII(self)) {
11847 kind = PyUnicode_KIND(self);
11848 data = PyUnicode_1BYTE_DATA(self);
11849 return unicode_fromascii(data + start, length);
11850 }
11851 else {
11852 kind = PyUnicode_KIND(self);
11853 data = PyUnicode_1BYTE_DATA(self);
11854 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011855 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011856 length);
11857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859
11860static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011861do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 int kind;
11864 void *data;
11865 Py_ssize_t len, i, j;
11866
11867 if (PyUnicode_READY(self) == -1)
11868 return NULL;
11869
11870 kind = PyUnicode_KIND(self);
11871 data = PyUnicode_DATA(self);
11872 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011873
Benjamin Peterson14339b62009-01-31 16:36:08 +000011874 i = 0;
11875 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011877 i++;
11878 }
11879 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011880
Benjamin Peterson14339b62009-01-31 16:36:08 +000011881 j = len;
11882 if (striptype != LEFTSTRIP) {
11883 do {
11884 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011886 j++;
11887 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011888
Victor Stinner7931d9a2011-11-04 00:22:48 +010011889 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890}
11891
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011892
11893static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011894do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011895{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011896 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011897
Benjamin Peterson14339b62009-01-31 16:36:08 +000011898 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11899 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011900
Benjamin Peterson14339b62009-01-31 16:36:08 +000011901 if (sep != NULL && sep != Py_None) {
11902 if (PyUnicode_Check(sep))
11903 return _PyUnicode_XStrip(self, striptype, sep);
11904 else {
11905 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "%s arg must be None or str",
11907 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011908 return NULL;
11909 }
11910 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011911
Benjamin Peterson14339b62009-01-31 16:36:08 +000011912 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011913}
11914
11915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011916PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011918\n\
11919Return a copy of the string S with leading and trailing\n\
11920whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011921If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011922
11923static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011924unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011925{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011926 if (PyTuple_GET_SIZE(args) == 0)
11927 return do_strip(self, BOTHSTRIP); /* Common case */
11928 else
11929 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011930}
11931
11932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011933PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011935\n\
11936Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011937If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011938
11939static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011940unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011941{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011942 if (PyTuple_GET_SIZE(args) == 0)
11943 return do_strip(self, LEFTSTRIP); /* Common case */
11944 else
11945 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011946}
11947
11948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011949PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011951\n\
11952Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011953If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011954
11955static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011956unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011957{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011958 if (PyTuple_GET_SIZE(args) == 0)
11959 return do_strip(self, RIGHTSTRIP); /* Common case */
11960 else
11961 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011962}
11963
11964
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011968 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Georg Brandl222de0f2009-04-12 12:01:50 +000011971 if (len < 1) {
11972 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011973 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
Victor Stinnerc4b49542011-12-11 22:44:26 +010011976 /* no repeat, return original string */
11977 if (len == 1)
11978 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011979
Benjamin Petersonbac79492012-01-14 13:34:47 -050011980 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 return NULL;
11982
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011983 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011984 PyErr_SetString(PyExc_OverflowError,
11985 "repeated string is too long");
11986 return NULL;
11987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011989
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011990 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991 if (!u)
11992 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011993 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (PyUnicode_GET_LENGTH(str) == 1) {
11996 const int kind = PyUnicode_KIND(str);
11997 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011998 if (kind == PyUnicode_1BYTE_KIND) {
11999 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012000 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012001 }
12002 else if (kind == PyUnicode_2BYTE_KIND) {
12003 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012004 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012005 ucs2[n] = fill_char;
12006 } else {
12007 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12008 assert(kind == PyUnicode_4BYTE_KIND);
12009 for (n = 0; n < len; ++n)
12010 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 }
12013 else {
12014 /* number of characters copied this far */
12015 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012016 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 char *to = (char *) PyUnicode_DATA(u);
12018 Py_MEMCPY(to, PyUnicode_DATA(str),
12019 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 n = (done <= nchars-done) ? done : nchars-done;
12022 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012023 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025 }
12026
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012027 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012028 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
Alexander Belopolsky40018472011-02-26 01:02:56 +000012031PyObject *
12032PyUnicode_Replace(PyObject *obj,
12033 PyObject *subobj,
12034 PyObject *replobj,
12035 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036{
12037 PyObject *self;
12038 PyObject *str1;
12039 PyObject *str2;
12040 PyObject *result;
12041
12042 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012043 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012046 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 Py_DECREF(self);
12048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 }
12050 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012051 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 Py_DECREF(self);
12053 Py_DECREF(str1);
12054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012056 if (PyUnicode_READY(self) == -1 ||
12057 PyUnicode_READY(str1) == -1 ||
12058 PyUnicode_READY(str2) == -1)
12059 result = NULL;
12060 else
12061 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 Py_DECREF(self);
12063 Py_DECREF(str1);
12064 Py_DECREF(str2);
12065 return result;
12066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012069 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
12071Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012072old replaced by new. If the optional argument count is\n\
12073given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
12075static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 PyObject *str1;
12079 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012080 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 PyObject *result;
12082
Martin v. Löwis18e16552006-02-15 17:27:45 +000012083 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012085 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012088 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 return NULL;
12090 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012091 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 Py_DECREF(str1);
12093 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012094 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012095 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12096 result = NULL;
12097 else
12098 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
12100 Py_DECREF(str1);
12101 Py_DECREF(str2);
12102 return result;
12103}
12104
Alexander Belopolsky40018472011-02-26 01:02:56 +000012105static PyObject *
12106unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012108 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 Py_ssize_t isize;
12110 Py_ssize_t osize, squote, dquote, i, o;
12111 Py_UCS4 max, quote;
12112 int ikind, okind;
12113 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012116 return NULL;
12117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 isize = PyUnicode_GET_LENGTH(unicode);
12119 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 /* Compute length of output, quote characters, and
12122 maximum character */
12123 osize = 2; /* quotes */
12124 max = 127;
12125 squote = dquote = 0;
12126 ikind = PyUnicode_KIND(unicode);
12127 for (i = 0; i < isize; i++) {
12128 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12129 switch (ch) {
12130 case '\'': squote++; osize++; break;
12131 case '"': dquote++; osize++; break;
12132 case '\\': case '\t': case '\r': case '\n':
12133 osize += 2; break;
12134 default:
12135 /* Fast-path ASCII */
12136 if (ch < ' ' || ch == 0x7f)
12137 osize += 4; /* \xHH */
12138 else if (ch < 0x7f)
12139 osize++;
12140 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12141 osize++;
12142 max = ch > max ? ch : max;
12143 }
12144 else if (ch < 0x100)
12145 osize += 4; /* \xHH */
12146 else if (ch < 0x10000)
12147 osize += 6; /* \uHHHH */
12148 else
12149 osize += 10; /* \uHHHHHHHH */
12150 }
12151 }
12152
12153 quote = '\'';
12154 if (squote) {
12155 if (dquote)
12156 /* Both squote and dquote present. Use squote,
12157 and escape them */
12158 osize += squote;
12159 else
12160 quote = '"';
12161 }
12162
12163 repr = PyUnicode_New(osize, max);
12164 if (repr == NULL)
12165 return NULL;
12166 okind = PyUnicode_KIND(repr);
12167 odata = PyUnicode_DATA(repr);
12168
12169 PyUnicode_WRITE(okind, odata, 0, quote);
12170 PyUnicode_WRITE(okind, odata, osize-1, quote);
12171
12172 for (i = 0, o = 1; i < isize; i++) {
12173 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012174
12175 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if ((ch == quote) || (ch == '\\')) {
12177 PyUnicode_WRITE(okind, odata, o++, '\\');
12178 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012179 continue;
12180 }
12181
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012183 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 PyUnicode_WRITE(okind, odata, o++, '\\');
12185 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012186 }
12187 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 PyUnicode_WRITE(okind, odata, o++, '\\');
12189 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012190 }
12191 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 PyUnicode_WRITE(okind, odata, o++, '\\');
12193 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012194 }
12195
12196 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012197 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 PyUnicode_WRITE(okind, odata, o++, '\\');
12199 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012200 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12201 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012202 }
12203
Georg Brandl559e5d72008-06-11 18:37:52 +000012204 /* Copy ASCII characters as-is */
12205 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012207 }
12208
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012210 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012211 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012212 (categories Z* and C* except ASCII space)
12213 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012215 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (ch <= 0xff) {
12217 PyUnicode_WRITE(okind, odata, o++, '\\');
12218 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012219 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012221 }
12222 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 else if (ch >= 0x10000) {
12224 PyUnicode_WRITE(okind, odata, o++, '\\');
12225 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012226 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12227 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12229 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12232 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012234 }
12235 /* Map 16-bit characters to '\uxxxx' */
12236 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 PyUnicode_WRITE(okind, odata, o++, '\\');
12238 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012239 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12242 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012243 }
12244 }
12245 /* Copy characters as-is */
12246 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012248 }
12249 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012252 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012253 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254}
12255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012256PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258\n\
12259Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012260such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261arguments start and end are interpreted as in slice notation.\n\
12262\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012263Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264
12265static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012268 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012269 Py_ssize_t start;
12270 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272
Jesus Ceaac451502011-04-20 17:09:23 +020012273 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12274 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279 if (PyUnicode_READY(substring) == -1)
12280 return NULL;
12281
Victor Stinner7931d9a2011-11-04 00:22:48 +010012282 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
12284 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (result == -2)
12287 return NULL;
12288
Christian Heimes217cfd12007-12-02 14:31:20 +000012289 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290}
12291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012292PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012295Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
12297static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012300 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012301 Py_ssize_t start;
12302 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012303 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304
Jesus Ceaac451502011-04-20 17:09:23 +020012305 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12306 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (PyUnicode_READY(self) == -1)
12310 return NULL;
12311 if (PyUnicode_READY(substring) == -1)
12312 return NULL;
12313
Victor Stinner7931d9a2011-11-04 00:22:48 +010012314 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315
12316 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (result == -2)
12319 return NULL;
12320
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321 if (result < 0) {
12322 PyErr_SetString(PyExc_ValueError, "substring not found");
12323 return NULL;
12324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325
Christian Heimes217cfd12007-12-02 14:31:20 +000012326 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327}
12328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012329PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012332Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012333done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
12335static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012336unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012338 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 Py_UCS4 fillchar = ' ';
12340
Victor Stinnere9a29352011-10-01 02:14:59 +020012341 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012343
Benjamin Petersonbac79492012-01-14 13:34:47 -050012344 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345 return NULL;
12346
Victor Stinnerc4b49542011-12-11 22:44:26 +010012347 if (PyUnicode_GET_LENGTH(self) >= width)
12348 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349
Victor Stinnerc4b49542011-12-11 22:44:26 +010012350 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351}
12352
Alexander Belopolsky40018472011-02-26 01:02:56 +000012353PyObject *
12354PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355{
12356 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012357
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 s = PyUnicode_FromObject(s);
12359 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012360 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 if (sep != NULL) {
12362 sep = PyUnicode_FromObject(sep);
12363 if (sep == NULL) {
12364 Py_DECREF(s);
12365 return NULL;
12366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367 }
12368
Victor Stinner9310abb2011-10-05 00:59:23 +020012369 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370
12371 Py_DECREF(s);
12372 Py_XDECREF(sep);
12373 return result;
12374}
12375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012376PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378\n\
12379Return a list of the words in S, using sep as the\n\
12380delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012381splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012382whitespace string is a separator and empty strings are\n\
12383removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384
12385static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012386unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387{
12388 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012389 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390
Martin v. Löwis18e16552006-02-15 17:27:45 +000012391 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392 return NULL;
12393
12394 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012397 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012399 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400}
12401
Thomas Wouters477c8d52006-05-27 19:21:47 +000012402PyObject *
12403PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12404{
12405 PyObject* str_obj;
12406 PyObject* sep_obj;
12407 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 int kind1, kind2, kind;
12409 void *buf1 = NULL, *buf2 = NULL;
12410 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012411
12412 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012413 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012415 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012416 if (!sep_obj) {
12417 Py_DECREF(str_obj);
12418 return NULL;
12419 }
12420 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12421 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422 Py_DECREF(str_obj);
12423 return NULL;
12424 }
12425
Victor Stinner14f8f022011-10-05 20:58:25 +020012426 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012428 kind = Py_MAX(kind1, kind2);
12429 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012431 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 if (!buf1)
12433 goto onError;
12434 buf2 = PyUnicode_DATA(sep_obj);
12435 if (kind2 != kind)
12436 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12437 if (!buf2)
12438 goto onError;
12439 len1 = PyUnicode_GET_LENGTH(str_obj);
12440 len2 = PyUnicode_GET_LENGTH(sep_obj);
12441
Benjamin Petersonead6b532011-12-20 17:23:42 -060012442 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012444 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12445 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12446 else
12447 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 break;
12449 case PyUnicode_2BYTE_KIND:
12450 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12451 break;
12452 case PyUnicode_4BYTE_KIND:
12453 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12454 break;
12455 default:
12456 assert(0);
12457 out = 0;
12458 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459
12460 Py_DECREF(sep_obj);
12461 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 if (kind1 != kind)
12463 PyMem_Free(buf1);
12464 if (kind2 != kind)
12465 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466
12467 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 onError:
12469 Py_DECREF(sep_obj);
12470 Py_DECREF(str_obj);
12471 if (kind1 != kind && buf1)
12472 PyMem_Free(buf1);
12473 if (kind2 != kind && buf2)
12474 PyMem_Free(buf2);
12475 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476}
12477
12478
12479PyObject *
12480PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12481{
12482 PyObject* str_obj;
12483 PyObject* sep_obj;
12484 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 int kind1, kind2, kind;
12486 void *buf1 = NULL, *buf2 = NULL;
12487 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012488
12489 str_obj = PyUnicode_FromObject(str_in);
12490 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012492 sep_obj = PyUnicode_FromObject(sep_in);
12493 if (!sep_obj) {
12494 Py_DECREF(str_obj);
12495 return NULL;
12496 }
12497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 kind1 = PyUnicode_KIND(str_in);
12499 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012500 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 buf1 = PyUnicode_DATA(str_in);
12502 if (kind1 != kind)
12503 buf1 = _PyUnicode_AsKind(str_in, kind);
12504 if (!buf1)
12505 goto onError;
12506 buf2 = PyUnicode_DATA(sep_obj);
12507 if (kind2 != kind)
12508 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12509 if (!buf2)
12510 goto onError;
12511 len1 = PyUnicode_GET_LENGTH(str_obj);
12512 len2 = PyUnicode_GET_LENGTH(sep_obj);
12513
Benjamin Petersonead6b532011-12-20 17:23:42 -060012514 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012516 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12517 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12518 else
12519 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 break;
12521 case PyUnicode_2BYTE_KIND:
12522 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12523 break;
12524 case PyUnicode_4BYTE_KIND:
12525 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12526 break;
12527 default:
12528 assert(0);
12529 out = 0;
12530 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012531
12532 Py_DECREF(sep_obj);
12533 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 if (kind1 != kind)
12535 PyMem_Free(buf1);
12536 if (kind2 != kind)
12537 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012538
12539 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 onError:
12541 Py_DECREF(sep_obj);
12542 Py_DECREF(str_obj);
12543 if (kind1 != kind && buf1)
12544 PyMem_Free(buf1);
12545 if (kind2 != kind && buf2)
12546 PyMem_Free(buf2);
12547 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012548}
12549
12550PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012552\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012553Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012554the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012555found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012556
12557static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012558unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559{
Victor Stinner9310abb2011-10-05 00:59:23 +020012560 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561}
12562
12563PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012564 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012565\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012566Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012567the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012568separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012569
12570static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012571unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572{
Victor Stinner9310abb2011-10-05 00:59:23 +020012573 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012574}
12575
Alexander Belopolsky40018472011-02-26 01:02:56 +000012576PyObject *
12577PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012578{
12579 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012580
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012581 s = PyUnicode_FromObject(s);
12582 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012583 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 if (sep != NULL) {
12585 sep = PyUnicode_FromObject(sep);
12586 if (sep == NULL) {
12587 Py_DECREF(s);
12588 return NULL;
12589 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012590 }
12591
Victor Stinner9310abb2011-10-05 00:59:23 +020012592 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012593
12594 Py_DECREF(s);
12595 Py_XDECREF(sep);
12596 return result;
12597}
12598
12599PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012601\n\
12602Return a list of the words in S, using sep as the\n\
12603delimiter string, starting at the end of the string and\n\
12604working to the front. If maxsplit is given, at most maxsplit\n\
12605splits are done. If sep is not specified, any whitespace string\n\
12606is a separator.");
12607
12608static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012609unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012610{
12611 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012612 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012613
Martin v. Löwis18e16552006-02-15 17:27:45 +000012614 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012615 return NULL;
12616
12617 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012619 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012620 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012621 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012622 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012623}
12624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012625PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627\n\
12628Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012629Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012630is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
12632static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012633unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012635 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012636 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012638 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12639 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 return NULL;
12641
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012642 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
12645static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012646PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012648 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649}
12650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012651PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653\n\
12654Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012655and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
12657static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012658unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012660 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661}
12662
Georg Brandlceee0772007-11-27 23:48:05 +000012663PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012665\n\
12666Return a translation table usable for str.translate().\n\
12667If there is only one argument, it must be a dictionary mapping Unicode\n\
12668ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012669Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012670If there are two arguments, they must be strings of equal length, and\n\
12671in the resulting dictionary, each character in x will be mapped to the\n\
12672character at the same position in y. If there is a third argument, it\n\
12673must be a string, whose characters will be mapped to None in the result.");
12674
12675static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012676unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012677{
12678 PyObject *x, *y = NULL, *z = NULL;
12679 PyObject *new = NULL, *key, *value;
12680 Py_ssize_t i = 0;
12681 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012682
Georg Brandlceee0772007-11-27 23:48:05 +000012683 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12684 return NULL;
12685 new = PyDict_New();
12686 if (!new)
12687 return NULL;
12688 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 int x_kind, y_kind, z_kind;
12690 void *x_data, *y_data, *z_data;
12691
Georg Brandlceee0772007-11-27 23:48:05 +000012692 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012693 if (!PyUnicode_Check(x)) {
12694 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12695 "be a string if there is a second argument");
12696 goto err;
12697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012699 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12700 "arguments must have equal length");
12701 goto err;
12702 }
12703 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 x_kind = PyUnicode_KIND(x);
12705 y_kind = PyUnicode_KIND(y);
12706 x_data = PyUnicode_DATA(x);
12707 y_data = PyUnicode_DATA(y);
12708 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12709 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012710 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012711 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012712 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012713 if (!value) {
12714 Py_DECREF(key);
12715 goto err;
12716 }
Georg Brandlceee0772007-11-27 23:48:05 +000012717 res = PyDict_SetItem(new, key, value);
12718 Py_DECREF(key);
12719 Py_DECREF(value);
12720 if (res < 0)
12721 goto err;
12722 }
12723 /* create entries for deleting chars in z */
12724 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 z_kind = PyUnicode_KIND(z);
12726 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012727 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012729 if (!key)
12730 goto err;
12731 res = PyDict_SetItem(new, key, Py_None);
12732 Py_DECREF(key);
12733 if (res < 0)
12734 goto err;
12735 }
12736 }
12737 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 int kind;
12739 void *data;
12740
Georg Brandlceee0772007-11-27 23:48:05 +000012741 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012742 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012743 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12744 "to maketrans it must be a dict");
12745 goto err;
12746 }
12747 /* copy entries into the new dict, converting string keys to int keys */
12748 while (PyDict_Next(x, &i, &key, &value)) {
12749 if (PyUnicode_Check(key)) {
12750 /* convert string keys to integer keys */
12751 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012752 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012753 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12754 "table must be of length 1");
12755 goto err;
12756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 kind = PyUnicode_KIND(key);
12758 data = PyUnicode_DATA(key);
12759 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012760 if (!newkey)
12761 goto err;
12762 res = PyDict_SetItem(new, newkey, value);
12763 Py_DECREF(newkey);
12764 if (res < 0)
12765 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012766 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012767 /* just keep integer keys */
12768 if (PyDict_SetItem(new, key, value) < 0)
12769 goto err;
12770 } else {
12771 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12772 "be strings or integers");
12773 goto err;
12774 }
12775 }
12776 }
12777 return new;
12778 err:
12779 Py_DECREF(new);
12780 return NULL;
12781}
12782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012783PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785\n\
12786Return a copy of the string S, where all characters have been mapped\n\
12787through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012788Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012789Unmapped characters are left untouched. Characters mapped to None\n\
12790are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791
12792static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796}
12797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012798PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012801Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
12803static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012804unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012806 if (PyUnicode_READY(self) == -1)
12807 return NULL;
12808 if (PyUnicode_IS_ASCII(self))
12809 return ascii_upper_or_lower(self, 0);
12810 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811}
12812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012813PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012816Pad a numeric string S with zeros on the left, to fill a field\n\
12817of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
12819static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012820unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012822 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012823 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012824 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 int kind;
12826 void *data;
12827 Py_UCS4 chr;
12828
Martin v. Löwis18e16552006-02-15 17:27:45 +000012829 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830 return NULL;
12831
Benjamin Petersonbac79492012-01-14 13:34:47 -050012832 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
Victor Stinnerc4b49542011-12-11 22:44:26 +010012835 if (PyUnicode_GET_LENGTH(self) >= width)
12836 return unicode_result_unchanged(self);
12837
12838 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839
12840 u = pad(self, fill, 0, '0');
12841
Walter Dörwald068325e2002-04-15 13:36:47 +000012842 if (u == NULL)
12843 return NULL;
12844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845 kind = PyUnicode_KIND(u);
12846 data = PyUnicode_DATA(u);
12847 chr = PyUnicode_READ(kind, data, fill);
12848
12849 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 PyUnicode_WRITE(kind, data, 0, chr);
12852 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853 }
12854
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012855 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012856 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858
#if 0
/* Compiled out.  Debug-only wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(); its method-table entry is
   likewise guarded by #if 0. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012867PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012870Return True if S starts with the specified prefix, False otherwise.\n\
12871With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012872With optional end, stop comparing S at that position.\n\
12873prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
12875static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012879 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012880 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012881 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012882 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012883 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
Jesus Ceaac451502011-04-20 17:09:23 +020012885 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012887 if (PyTuple_Check(subobj)) {
12888 Py_ssize_t i;
12889 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012890 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012891 if (substring == NULL)
12892 return NULL;
12893 result = tailmatch(self, substring, start, end, -1);
12894 Py_DECREF(substring);
12895 if (result) {
12896 Py_RETURN_TRUE;
12897 }
12898 }
12899 /* nothing matched */
12900 Py_RETURN_FALSE;
12901 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012902 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012903 if (substring == NULL) {
12904 if (PyErr_ExceptionMatches(PyExc_TypeError))
12905 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12906 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012908 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012909 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012911 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912}
12913
12914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012915PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012918Return True if S ends with the specified suffix, False otherwise.\n\
12919With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012920With optional end, stop comparing S at that position.\n\
12921suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922
12923static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012924unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012927 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012928 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012929 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012930 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012931 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932
Jesus Ceaac451502011-04-20 17:09:23 +020012933 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012935 if (PyTuple_Check(subobj)) {
12936 Py_ssize_t i;
12937 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012938 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012939 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012940 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012941 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012942 result = tailmatch(self, substring, start, end, +1);
12943 Py_DECREF(substring);
12944 if (result) {
12945 Py_RETURN_TRUE;
12946 }
12947 }
12948 Py_RETURN_FALSE;
12949 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012950 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012951 if (substring == NULL) {
12952 if (PyErr_ExceptionMatches(PyExc_TypeError))
12953 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12954 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012956 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012957 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012959 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960}
12961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012963
12964PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012966\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012967Return a formatted version of S, using substitutions from args and kwargs.\n\
12968The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012969
Eric Smith27bbca62010-11-04 17:06:58 +000012970PyDoc_STRVAR(format_map__doc__,
12971 "S.format_map(mapping) -> str\n\
12972\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012973Return a formatted version of S, using substitutions from mapping.\n\
12974The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012975
Eric Smith4a7d76d2008-05-30 18:10:19 +000012976static PyObject *
12977unicode__format__(PyObject* self, PyObject* args)
12978{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012979 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012980
12981 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12982 return NULL;
12983
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012984 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012986 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012987}
12988
Eric Smith8c663262007-08-25 02:26:07 +000012989PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012990 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012991\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012992Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012993
12994static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012995unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 Py_ssize_t size;
12998
12999 /* If it's a compact object, account for base structure +
13000 character data. */
13001 if (PyUnicode_IS_COMPACT_ASCII(v))
13002 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13003 else if (PyUnicode_IS_COMPACT(v))
13004 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013005 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 else {
13007 /* If it is a two-block object, account for base object, and
13008 for character block if present. */
13009 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013010 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013012 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 }
13014 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013015 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013016 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013018 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013019 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020
13021 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013022}
13023
13024PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013025 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013026
13027static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013028unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013029{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013030 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031 if (!copy)
13032 return NULL;
13033 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013034}
13035
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036static PyMethodDef unicode_methods[] = {
13037
13038 /* Order is according to common usage: often used methods should
13039 appear first, since lookup is done sequentially. */
13040
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013041 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013042 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13043 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013044 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013045 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13046 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013047 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013048 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13049 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13050 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13051 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13052 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013054 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13055 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13056 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013057 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013058 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13059 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13060 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013061 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013063 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013064 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013065 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13066 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13067 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13068 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13069 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13070 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13071 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13072 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13073 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13074 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13075 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13076 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13077 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13078 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013079 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013080 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013081 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013082 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013083 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013084 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013085 {"maketrans", (PyCFunction) unicode_maketrans,
13086 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013087 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013088#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013089 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013090 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091#endif
13092
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094 {NULL, NULL}
13095};
13096
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013097static PyObject *
13098unicode_mod(PyObject *v, PyObject *w)
13099{
Brian Curtindfc80e32011-08-10 20:28:54 -050013100 if (!PyUnicode_Check(v))
13101 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013102 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013103}
13104
13105static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013106 0, /*nb_add*/
13107 0, /*nb_subtract*/
13108 0, /*nb_multiply*/
13109 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013110};
13111
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013113 (lenfunc) unicode_length, /* sq_length */
13114 PyUnicode_Concat, /* sq_concat */
13115 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13116 (ssizeargfunc) unicode_getitem, /* sq_item */
13117 0, /* sq_slice */
13118 0, /* sq_ass_item */
13119 0, /* sq_ass_slice */
13120 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121};
13122
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013123static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013124unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 if (PyUnicode_READY(self) == -1)
13127 return NULL;
13128
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013129 if (PyIndex_Check(item)) {
13130 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013131 if (i == -1 && PyErr_Occurred())
13132 return NULL;
13133 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013135 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013136 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013137 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013138 PyObject *result;
13139 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013140 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013141 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013145 return NULL;
13146 }
13147
13148 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013149 Py_INCREF(unicode_empty);
13150 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013152 slicelength == PyUnicode_GET_LENGTH(self)) {
13153 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013154 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013155 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013156 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013157 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013158 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013159 src_kind = PyUnicode_KIND(self);
13160 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013161 if (!PyUnicode_IS_ASCII(self)) {
13162 kind_limit = kind_maxchar_limit(src_kind);
13163 max_char = 0;
13164 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13165 ch = PyUnicode_READ(src_kind, src_data, cur);
13166 if (ch > max_char) {
13167 max_char = ch;
13168 if (max_char >= kind_limit)
13169 break;
13170 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013171 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013172 }
Victor Stinner55c99112011-10-13 01:17:06 +020013173 else
13174 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013175 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013176 if (result == NULL)
13177 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013178 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013179 dest_data = PyUnicode_DATA(result);
13180
13181 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013182 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13183 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013184 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013185 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013186 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013187 } else {
13188 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13189 return NULL;
13190 }
13191}
13192
13193static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 (lenfunc)unicode_length, /* mp_length */
13195 (binaryfunc)unicode_subscript, /* mp_subscript */
13196 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013197};
13198
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200/* Helpers for PyUnicode_Format() */
13201
13202static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013203getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013205 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 (*p_argidx)++;
13208 if (arglen < 0)
13209 return args;
13210 else
13211 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212 }
13213 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215 return NULL;
13216}
13217
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013218/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013220static PyObject *
13221formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013223 char *p;
13224 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013226
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227 x = PyFloat_AsDouble(v);
13228 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013229 return NULL;
13230
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013233
Eric Smith0923d1d2009-04-16 20:16:10 +000013234 p = PyOS_double_to_string(x, type, prec,
13235 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013236 if (p == NULL)
13237 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013239 PyMem_Free(p);
13240 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241}
13242
Tim Peters38fd5b62000-09-21 05:43:11 +000013243static PyObject*
13244formatlong(PyObject *val, int flags, int prec, int type)
13245{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013246 char *buf;
13247 int len;
13248 PyObject *str; /* temporary string object. */
13249 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013250
Benjamin Peterson14339b62009-01-31 16:36:08 +000013251 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13252 if (!str)
13253 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 Py_DECREF(str);
13256 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013257}
13258
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013259static Py_UCS4
13260formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013262 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013263 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013265 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013266 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 goto onError;
13268 }
13269 else {
13270 /* Integer input truncated to a character */
13271 long x;
13272 x = PyLong_AsLong(v);
13273 if (x == -1 && PyErr_Occurred())
13274 goto onError;
13275
Victor Stinner8faf8212011-12-08 22:14:11 +010013276 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 PyErr_SetString(PyExc_OverflowError,
13278 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013279 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 }
13281
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013282 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013283 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013284
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013286 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013287 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013288 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289}
13290
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013291static int
13292repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13293{
13294 int r;
13295 assert(count > 0);
13296 assert(PyUnicode_Check(obj));
13297 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013298 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013299 if (repeated == NULL)
13300 return -1;
13301 r = _PyAccu_Accumulate(acc, repeated);
13302 Py_DECREF(repeated);
13303 return r;
13304 }
13305 else {
13306 do {
13307 if (_PyAccu_Accumulate(acc, obj))
13308 return -1;
13309 } while (--count);
13310 return 0;
13311 }
13312}
13313
Alexander Belopolsky40018472011-02-26 01:02:56 +000013314PyObject *
13315PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 void *fmt;
13318 int fmtkind;
13319 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013321 int r;
13322 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013325 PyObject *temp = NULL;
13326 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013327 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013328 _PyAccu acc;
13329 static PyObject *plus, *minus, *blank, *zero, *percent;
13330
13331 if (!plus && !(plus = get_latin1_char('+')))
13332 return NULL;
13333 if (!minus && !(minus = get_latin1_char('-')))
13334 return NULL;
13335 if (!blank && !(blank = get_latin1_char(' ')))
13336 return NULL;
13337 if (!zero && !(zero = get_latin1_char('0')))
13338 return NULL;
13339 if (!percent && !(percent = get_latin1_char('%')))
13340 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013341
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 PyErr_BadInternalCall();
13344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013346 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013347 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013349 if (PyUnicode_READY(uformat) == -1)
13350 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013351 if (_PyAccu_Init(&acc))
13352 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 fmt = PyUnicode_DATA(uformat);
13354 fmtkind = PyUnicode_KIND(uformat);
13355 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13356 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 arglen = PyTuple_Size(args);
13360 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361 }
13362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013363 arglen = -1;
13364 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013366 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013367 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369
13370 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013372 PyObject *nonfmt;
13373 Py_ssize_t nonfmtpos;
13374 nonfmtpos = fmtpos++;
13375 while (fmtcnt >= 0 &&
13376 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13377 fmtpos++;
13378 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013379 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013380 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013381 if (nonfmt == NULL)
13382 goto onError;
13383 r = _PyAccu_Accumulate(&acc, nonfmt);
13384 Py_DECREF(nonfmt);
13385 if (r)
13386 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013387 }
13388 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 /* Got a format specifier */
13390 int flags = 0;
13391 Py_ssize_t width = -1;
13392 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013393 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013394 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 int isnumok;
13396 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 void *pbuf = NULL;
13398 Py_ssize_t pindex, len;
13399 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013401 fmtpos++;
13402 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13403 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 Py_ssize_t keylen;
13405 PyObject *key;
13406 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013407
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 if (dict == NULL) {
13409 PyErr_SetString(PyExc_TypeError,
13410 "format requires a mapping");
13411 goto onError;
13412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013413 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013415 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 /* Skip over balanced parentheses */
13417 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013422 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 if (fmtcnt < 0 || pcount > 0) {
13426 PyErr_SetString(PyExc_ValueError,
13427 "incomplete format key");
13428 goto onError;
13429 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013430 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013431 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 if (key == NULL)
13433 goto onError;
13434 if (args_owned) {
13435 Py_DECREF(args);
13436 args_owned = 0;
13437 }
13438 args = PyObject_GetItem(dict, key);
13439 Py_DECREF(key);
13440 if (args == NULL) {
13441 goto onError;
13442 }
13443 args_owned = 1;
13444 arglen = -1;
13445 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013446 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 case '-': flags |= F_LJUST; continue;
13450 case '+': flags |= F_SIGN; continue;
13451 case ' ': flags |= F_BLANK; continue;
13452 case '#': flags |= F_ALT; continue;
13453 case '0': flags |= F_ZERO; continue;
13454 }
13455 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013456 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 if (c == '*') {
13458 v = getnextarg(args, arglen, &argidx);
13459 if (v == NULL)
13460 goto onError;
13461 if (!PyLong_Check(v)) {
13462 PyErr_SetString(PyExc_TypeError,
13463 "* wants int");
13464 goto onError;
13465 }
13466 width = PyLong_AsLong(v);
13467 if (width == -1 && PyErr_Occurred())
13468 goto onError;
13469 if (width < 0) {
13470 flags |= F_LJUST;
13471 width = -width;
13472 }
13473 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 }
13476 else if (c >= '0' && c <= '9') {
13477 width = c - '0';
13478 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013479 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 if (c < '0' || c > '9')
13481 break;
13482 if ((width*10) / 10 != width) {
13483 PyErr_SetString(PyExc_ValueError,
13484 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013485 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 }
13487 width = width*10 + (c - '0');
13488 }
13489 }
13490 if (c == '.') {
13491 prec = 0;
13492 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013493 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 if (c == '*') {
13495 v = getnextarg(args, arglen, &argidx);
13496 if (v == NULL)
13497 goto onError;
13498 if (!PyLong_Check(v)) {
13499 PyErr_SetString(PyExc_TypeError,
13500 "* wants int");
13501 goto onError;
13502 }
13503 prec = PyLong_AsLong(v);
13504 if (prec == -1 && PyErr_Occurred())
13505 goto onError;
13506 if (prec < 0)
13507 prec = 0;
13508 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 }
13511 else if (c >= '0' && c <= '9') {
13512 prec = c - '0';
13513 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013514 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 if (c < '0' || c > '9')
13516 break;
13517 if ((prec*10) / 10 != prec) {
13518 PyErr_SetString(PyExc_ValueError,
13519 "prec too big");
13520 goto onError;
13521 }
13522 prec = prec*10 + (c - '0');
13523 }
13524 }
13525 } /* prec */
13526 if (fmtcnt >= 0) {
13527 if (c == 'h' || c == 'l' || c == 'L') {
13528 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013529 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 }
13531 }
13532 if (fmtcnt < 0) {
13533 PyErr_SetString(PyExc_ValueError,
13534 "incomplete format");
13535 goto onError;
13536 }
13537 if (c != '%') {
13538 v = getnextarg(args, arglen, &argidx);
13539 if (v == NULL)
13540 goto onError;
13541 }
13542 sign = 0;
13543 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013544 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 switch (c) {
13546
13547 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013548 _PyAccu_Accumulate(&acc, percent);
13549 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013550
13551 case 's':
13552 case 'r':
13553 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013554 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 temp = v;
13556 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013557 }
13558 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 if (c == 's')
13560 temp = PyObject_Str(v);
13561 else if (c == 'r')
13562 temp = PyObject_Repr(v);
13563 else
13564 temp = PyObject_ASCII(v);
13565 if (temp == NULL)
13566 goto onError;
13567 if (PyUnicode_Check(temp))
13568 /* nothing to do */;
13569 else {
13570 Py_DECREF(temp);
13571 PyErr_SetString(PyExc_TypeError,
13572 "%s argument has non-string str()");
13573 goto onError;
13574 }
13575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 if (PyUnicode_READY(temp) == -1) {
13577 Py_CLEAR(temp);
13578 goto onError;
13579 }
13580 pbuf = PyUnicode_DATA(temp);
13581 kind = PyUnicode_KIND(temp);
13582 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 if (prec >= 0 && len > prec)
13584 len = prec;
13585 break;
13586
13587 case 'i':
13588 case 'd':
13589 case 'u':
13590 case 'o':
13591 case 'x':
13592 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013593 isnumok = 0;
13594 if (PyNumber_Check(v)) {
13595 PyObject *iobj=NULL;
13596
13597 if (PyLong_Check(v)) {
13598 iobj = v;
13599 Py_INCREF(iobj);
13600 }
13601 else {
13602 iobj = PyNumber_Long(v);
13603 }
13604 if (iobj!=NULL) {
13605 if (PyLong_Check(iobj)) {
13606 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013607 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 Py_DECREF(iobj);
13609 if (!temp)
13610 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 if (PyUnicode_READY(temp) == -1) {
13612 Py_CLEAR(temp);
13613 goto onError;
13614 }
13615 pbuf = PyUnicode_DATA(temp);
13616 kind = PyUnicode_KIND(temp);
13617 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 sign = 1;
13619 }
13620 else {
13621 Py_DECREF(iobj);
13622 }
13623 }
13624 }
13625 if (!isnumok) {
13626 PyErr_Format(PyExc_TypeError,
13627 "%%%c format: a number is required, "
13628 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13629 goto onError;
13630 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013632 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013633 fillobj = zero;
13634 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 break;
13636
13637 case 'e':
13638 case 'E':
13639 case 'f':
13640 case 'F':
13641 case 'g':
13642 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013643 temp = formatfloat(v, flags, prec, c);
13644 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646 if (PyUnicode_READY(temp) == -1) {
13647 Py_CLEAR(temp);
13648 goto onError;
13649 }
13650 pbuf = PyUnicode_DATA(temp);
13651 kind = PyUnicode_KIND(temp);
13652 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013654 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013655 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013656 fillobj = zero;
13657 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013658 break;
13659
13660 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013661 {
13662 Py_UCS4 ch = formatchar(v);
13663 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013665 temp = _PyUnicode_FromUCS4(&ch, 1);
13666 if (temp == NULL)
13667 goto onError;
13668 pbuf = PyUnicode_DATA(temp);
13669 kind = PyUnicode_KIND(temp);
13670 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013672 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013673
13674 default:
13675 PyErr_Format(PyExc_ValueError,
13676 "unsupported format character '%c' (0x%x) "
13677 "at index %zd",
13678 (31<=c && c<=126) ? (char)c : '?',
13679 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013680 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 goto onError;
13682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013683 /* pbuf is initialized here. */
13684 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013686 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13687 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013689 pindex++;
13690 }
13691 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13692 signobj = plus;
13693 len--;
13694 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 }
13696 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013697 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013699 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 else
13701 sign = 0;
13702 }
13703 if (width < len)
13704 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013706 if (fill != ' ') {
13707 assert(signobj != NULL);
13708 if (_PyAccu_Accumulate(&acc, signobj))
13709 goto onError;
13710 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 if (width > len)
13712 width--;
13713 }
13714 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013715 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013716 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013718 second = get_latin1_char(
13719 PyUnicode_READ(kind, pbuf, pindex + 1));
13720 pindex += 2;
13721 if (second == NULL ||
13722 _PyAccu_Accumulate(&acc, zero) ||
13723 _PyAccu_Accumulate(&acc, second))
13724 goto onError;
13725 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 width -= 2;
13728 if (width < 0)
13729 width = 0;
13730 len -= 2;
13731 }
13732 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013733 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013734 if (repeat_accumulate(&acc, fillobj, width - len))
13735 goto onError;
13736 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 }
13738 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013739 if (sign) {
13740 assert(signobj != NULL);
13741 if (_PyAccu_Accumulate(&acc, signobj))
13742 goto onError;
13743 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013744 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13746 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013747 second = get_latin1_char(
13748 PyUnicode_READ(kind, pbuf, pindex + 1));
13749 pindex += 2;
13750 if (second == NULL ||
13751 _PyAccu_Accumulate(&acc, zero) ||
13752 _PyAccu_Accumulate(&acc, second))
13753 goto onError;
13754 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013755 }
13756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013757 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013758 if (temp != NULL) {
13759 assert(pbuf == PyUnicode_DATA(temp));
13760 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013761 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013762 else {
13763 const char *p = (const char *) pbuf;
13764 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013765 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013766 v = PyUnicode_FromKindAndData(kind, p, len);
13767 }
13768 if (v == NULL)
13769 goto onError;
13770 r = _PyAccu_Accumulate(&acc, v);
13771 Py_DECREF(v);
13772 if (r)
13773 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013774 if (width > len && repeat_accumulate(&acc, blank, width - len))
13775 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 if (dict && (argidx < arglen) && c != '%') {
13777 PyErr_SetString(PyExc_TypeError,
13778 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 goto onError;
13780 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013781 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013783 } /* until end */
13784 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013785 PyErr_SetString(PyExc_TypeError,
13786 "not all arguments converted during string formatting");
13787 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013788 }
13789
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013790 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013793 }
13794 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013795 Py_XDECREF(temp);
13796 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013797 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013800 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013801 Py_XDECREF(temp);
13802 Py_XDECREF(second);
13803 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806 }
13807 return NULL;
13808}
13809
Jeremy Hylton938ace62002-07-17 16:30:39 +000013810static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013811unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13812
Tim Peters6d6c1a32001-08-02 04:15:00 +000013813static PyObject *
13814unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13815{
Benjamin Peterson29060642009-01-31 22:14:21 +000013816 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013817 static char *kwlist[] = {"object", "encoding", "errors", 0};
13818 char *encoding = NULL;
13819 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013820
Benjamin Peterson14339b62009-01-31 16:36:08 +000013821 if (type != &PyUnicode_Type)
13822 return unicode_subtype_new(type, args, kwds);
13823 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013824 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013825 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013826 if (x == NULL) {
13827 Py_INCREF(unicode_empty);
13828 return unicode_empty;
13829 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013830 if (encoding == NULL && errors == NULL)
13831 return PyObject_Str(x);
13832 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013833 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013834}
13835
Guido van Rossume023fe02001-08-30 03:12:59 +000013836static PyObject *
13837unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13838{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013839 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013840 Py_ssize_t length, char_size;
13841 int share_wstr, share_utf8;
13842 unsigned int kind;
13843 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013844
Benjamin Peterson14339b62009-01-31 16:36:08 +000013845 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013846
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013847 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013848 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013849 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013850 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013851 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013852 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013853 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013854 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013855
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013856 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013857 if (self == NULL) {
13858 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013859 return NULL;
13860 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013861 kind = PyUnicode_KIND(unicode);
13862 length = PyUnicode_GET_LENGTH(unicode);
13863
13864 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013865#ifdef Py_DEBUG
13866 _PyUnicode_HASH(self) = -1;
13867#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013868 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013869#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013870 _PyUnicode_STATE(self).interned = 0;
13871 _PyUnicode_STATE(self).kind = kind;
13872 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013873 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013874 _PyUnicode_STATE(self).ready = 1;
13875 _PyUnicode_WSTR(self) = NULL;
13876 _PyUnicode_UTF8_LENGTH(self) = 0;
13877 _PyUnicode_UTF8(self) = NULL;
13878 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013879 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013880
13881 share_utf8 = 0;
13882 share_wstr = 0;
13883 if (kind == PyUnicode_1BYTE_KIND) {
13884 char_size = 1;
13885 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13886 share_utf8 = 1;
13887 }
13888 else if (kind == PyUnicode_2BYTE_KIND) {
13889 char_size = 2;
13890 if (sizeof(wchar_t) == 2)
13891 share_wstr = 1;
13892 }
13893 else {
13894 assert(kind == PyUnicode_4BYTE_KIND);
13895 char_size = 4;
13896 if (sizeof(wchar_t) == 4)
13897 share_wstr = 1;
13898 }
13899
13900 /* Ensure we won't overflow the length. */
13901 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13902 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013903 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013905 data = PyObject_MALLOC((length + 1) * char_size);
13906 if (data == NULL) {
13907 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013908 goto onError;
13909 }
13910
Victor Stinnerc3c74152011-10-02 20:39:55 +020013911 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013912 if (share_utf8) {
13913 _PyUnicode_UTF8_LENGTH(self) = length;
13914 _PyUnicode_UTF8(self) = data;
13915 }
13916 if (share_wstr) {
13917 _PyUnicode_WSTR_LENGTH(self) = length;
13918 _PyUnicode_WSTR(self) = (wchar_t *)data;
13919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013920
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013921 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013922 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013923 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013924#ifdef Py_DEBUG
13925 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13926#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013927 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013928 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013929
13930onError:
13931 Py_DECREF(unicode);
13932 Py_DECREF(self);
13933 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013934}
13935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013936PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013938\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013939Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013940encoding defaults to the current default string encoding.\n\
13941errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013942
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013943static PyObject *unicode_iter(PyObject *seq);
13944
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013946 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013947 "str", /* tp_name */
13948 sizeof(PyUnicodeObject), /* tp_size */
13949 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013951 (destructor)unicode_dealloc, /* tp_dealloc */
13952 0, /* tp_print */
13953 0, /* tp_getattr */
13954 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013955 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013956 unicode_repr, /* tp_repr */
13957 &unicode_as_number, /* tp_as_number */
13958 &unicode_as_sequence, /* tp_as_sequence */
13959 &unicode_as_mapping, /* tp_as_mapping */
13960 (hashfunc) unicode_hash, /* tp_hash*/
13961 0, /* tp_call*/
13962 (reprfunc) unicode_str, /* tp_str */
13963 PyObject_GenericGetAttr, /* tp_getattro */
13964 0, /* tp_setattro */
13965 0, /* tp_as_buffer */
13966 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 unicode_doc, /* tp_doc */
13969 0, /* tp_traverse */
13970 0, /* tp_clear */
13971 PyUnicode_RichCompare, /* tp_richcompare */
13972 0, /* tp_weaklistoffset */
13973 unicode_iter, /* tp_iter */
13974 0, /* tp_iternext */
13975 unicode_methods, /* tp_methods */
13976 0, /* tp_members */
13977 0, /* tp_getset */
13978 &PyBaseObject_Type, /* tp_base */
13979 0, /* tp_dict */
13980 0, /* tp_descr_get */
13981 0, /* tp_descr_set */
13982 0, /* tp_dictoffset */
13983 0, /* tp_init */
13984 0, /* tp_alloc */
13985 unicode_new, /* tp_new */
13986 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013987};
13988
13989/* Initialize the Unicode implementation */
13990
Victor Stinner3a50e702011-10-18 21:21:00 +020013991int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013993 int i;
13994
Thomas Wouters477c8d52006-05-27 19:21:47 +000013995 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013996 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013997 0x000A, /* LINE FEED */
13998 0x000D, /* CARRIAGE RETURN */
13999 0x001C, /* FILE SEPARATOR */
14000 0x001D, /* GROUP SEPARATOR */
14001 0x001E, /* RECORD SEPARATOR */
14002 0x0085, /* NEXT LINE */
14003 0x2028, /* LINE SEPARATOR */
14004 0x2029, /* PARAGRAPH SEPARATOR */
14005 };
14006
Fred Drakee4315f52000-05-09 19:53:39 +000014007 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014008 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014009 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014010 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014011 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014013 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014014 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014015 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014016 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014017
14018 /* initialize the linebreak bloom filter */
14019 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014020 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014021 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014022
14023 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014024
14025#ifdef HAVE_MBCS
14026 winver.dwOSVersionInfoSize = sizeof(winver);
14027 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14028 PyErr_SetFromWindowsErr(0);
14029 return -1;
14030 }
14031#endif
14032 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014033}
14034
14035/* Finalize the Unicode implementation */
14036
/* Nothing is released here; the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14042
Guido van Rossumd57fd912000-03-10 22:53:23 +000014043void
Thomas Wouters78890102000-07-22 19:25:51 +000014044_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014045{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014046 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014047
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014048 Py_XDECREF(unicode_empty);
14049 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014050
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014051 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014052 if (unicode_latin1[i]) {
14053 Py_DECREF(unicode_latin1[i]);
14054 unicode_latin1[i] = NULL;
14055 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014056 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014057 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014058 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014060
Walter Dörwald16807132007-05-25 13:52:07 +000014061void
14062PyUnicode_InternInPlace(PyObject **p)
14063{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014064 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014065 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014066#ifdef Py_DEBUG
14067 assert(s != NULL);
14068 assert(_PyUnicode_CHECK(s));
14069#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014070 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014071 return;
14072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 /* If it's a subclass, we don't really know what putting
14074 it in the interned dict might do. */
14075 if (!PyUnicode_CheckExact(s))
14076 return;
14077 if (PyUnicode_CHECK_INTERNED(s))
14078 return;
14079 if (interned == NULL) {
14080 interned = PyDict_New();
14081 if (interned == NULL) {
14082 PyErr_Clear(); /* Don't leave an exception */
14083 return;
14084 }
14085 }
14086 /* It might be that the GetItem call fails even
14087 though the key is present in the dictionary,
14088 namely when this happens during a stack overflow. */
14089 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014090 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014092
Benjamin Peterson29060642009-01-31 22:14:21 +000014093 if (t) {
14094 Py_INCREF(t);
14095 Py_DECREF(*p);
14096 *p = t;
14097 return;
14098 }
Walter Dörwald16807132007-05-25 13:52:07 +000014099
Benjamin Peterson14339b62009-01-31 16:36:08 +000014100 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014101 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 PyErr_Clear();
14103 PyThreadState_GET()->recursion_critical = 0;
14104 return;
14105 }
14106 PyThreadState_GET()->recursion_critical = 0;
14107 /* The two references in interned are not counted by refcnt.
14108 The deallocator will take care of this */
14109 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014110 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014111}
14112
14113void
14114PyUnicode_InternImmortal(PyObject **p)
14115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014116 PyUnicode_InternInPlace(p);
14117 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014118 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014119 Py_INCREF(*p);
14120 }
Walter Dörwald16807132007-05-25 13:52:07 +000014121}
14122
14123PyObject *
14124PyUnicode_InternFromString(const char *cp)
14125{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 PyObject *s = PyUnicode_FromString(cp);
14127 if (s == NULL)
14128 return NULL;
14129 PyUnicode_InternInPlace(&s);
14130 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014131}
14132
Alexander Belopolsky40018472011-02-26 01:02:56 +000014133void
14134_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014135{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014137 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 Py_ssize_t i, n;
14139 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014140
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 if (interned == NULL || !PyDict_Check(interned))
14142 return;
14143 keys = PyDict_Keys(interned);
14144 if (keys == NULL || !PyList_Check(keys)) {
14145 PyErr_Clear();
14146 return;
14147 }
Walter Dörwald16807132007-05-25 13:52:07 +000014148
Benjamin Peterson14339b62009-01-31 16:36:08 +000014149 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14150 detector, interned unicode strings are not forcibly deallocated;
14151 rather, we give them their stolen references back, and then clear
14152 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014153
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 n = PyList_GET_SIZE(keys);
14155 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014158 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014159 if (PyUnicode_READY(s) == -1) {
14160 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014161 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014163 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 case SSTATE_NOT_INTERNED:
14165 /* XXX Shouldn't happen */
14166 break;
14167 case SSTATE_INTERNED_IMMORTAL:
14168 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014169 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 break;
14171 case SSTATE_INTERNED_MORTAL:
14172 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014173 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014174 break;
14175 default:
14176 Py_FatalError("Inconsistent interned string state.");
14177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014178 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 }
14180 fprintf(stderr, "total size of all interned strings: "
14181 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14182 "mortal/immortal\n", mortal_size, immortal_size);
14183 Py_DECREF(keys);
14184 PyDict_Clear(interned);
14185 Py_DECREF(interned);
14186 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014187}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014188
14189
14190/********************* Unicode Iterator **************************/
14191
14192typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 PyObject_HEAD
14194 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014195 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014196} unicodeiterobject;
14197
14198static void
14199unicodeiter_dealloc(unicodeiterobject *it)
14200{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 _PyObject_GC_UNTRACK(it);
14202 Py_XDECREF(it->it_seq);
14203 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014204}
14205
14206static int
14207unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14208{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014209 Py_VISIT(it->it_seq);
14210 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014211}
14212
14213static PyObject *
14214unicodeiter_next(unicodeiterobject *it)
14215{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014216 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014217
Benjamin Peterson14339b62009-01-31 16:36:08 +000014218 assert(it != NULL);
14219 seq = it->it_seq;
14220 if (seq == NULL)
14221 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014222 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014224 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14225 int kind = PyUnicode_KIND(seq);
14226 void *data = PyUnicode_DATA(seq);
14227 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14228 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014229 if (item != NULL)
14230 ++it->it_index;
14231 return item;
14232 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014233
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 Py_DECREF(seq);
14235 it->it_seq = NULL;
14236 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014237}
14238
14239static PyObject *
14240unicodeiter_len(unicodeiterobject *it)
14241{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014242 Py_ssize_t len = 0;
14243 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014244 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014246}
14247
14248PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14249
14250static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014251 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014252 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014253 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014254};
14255
14256PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014257 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14258 "str_iterator", /* tp_name */
14259 sizeof(unicodeiterobject), /* tp_basicsize */
14260 0, /* tp_itemsize */
14261 /* methods */
14262 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14263 0, /* tp_print */
14264 0, /* tp_getattr */
14265 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014266 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 0, /* tp_repr */
14268 0, /* tp_as_number */
14269 0, /* tp_as_sequence */
14270 0, /* tp_as_mapping */
14271 0, /* tp_hash */
14272 0, /* tp_call */
14273 0, /* tp_str */
14274 PyObject_GenericGetAttr, /* tp_getattro */
14275 0, /* tp_setattro */
14276 0, /* tp_as_buffer */
14277 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14278 0, /* tp_doc */
14279 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14280 0, /* tp_clear */
14281 0, /* tp_richcompare */
14282 0, /* tp_weaklistoffset */
14283 PyObject_SelfIter, /* tp_iter */
14284 (iternextfunc)unicodeiter_next, /* tp_iternext */
14285 unicodeiter_methods, /* tp_methods */
14286 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014287};
14288
14289static PyObject *
14290unicode_iter(PyObject *seq)
14291{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014292 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014293
Benjamin Peterson14339b62009-01-31 16:36:08 +000014294 if (!PyUnicode_Check(seq)) {
14295 PyErr_BadInternalCall();
14296 return NULL;
14297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014298 if (PyUnicode_READY(seq) == -1)
14299 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14301 if (it == NULL)
14302 return NULL;
14303 it->it_index = 0;
14304 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014305 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 _PyObject_GC_TRACK(it);
14307 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014308}
14309
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014310
14311size_t
14312Py_UNICODE_strlen(const Py_UNICODE *u)
14313{
14314 int res = 0;
14315 while(*u++)
14316 res++;
14317 return res;
14318}
14319
14320Py_UNICODE*
14321Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14322{
14323 Py_UNICODE *u = s1;
14324 while ((*u++ = *s2++));
14325 return s1;
14326}
14327
14328Py_UNICODE*
14329Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14330{
14331 Py_UNICODE *u = s1;
14332 while ((*u++ = *s2++))
14333 if (n-- == 0)
14334 break;
14335 return s1;
14336}
14337
14338Py_UNICODE*
14339Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14340{
14341 Py_UNICODE *u1 = s1;
14342 u1 += Py_UNICODE_strlen(u1);
14343 Py_UNICODE_strcpy(u1, s2);
14344 return s1;
14345}
14346
14347int
14348Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14349{
14350 while (*s1 && *s2 && *s1 == *s2)
14351 s1++, s2++;
14352 if (*s1 && *s2)
14353 return (*s1 < *s2) ? -1 : +1;
14354 if (*s1)
14355 return 1;
14356 if (*s2)
14357 return -1;
14358 return 0;
14359}
14360
14361int
14362Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14363{
14364 register Py_UNICODE u1, u2;
14365 for (; n != 0; n--) {
14366 u1 = *s1;
14367 u2 = *s2;
14368 if (u1 != u2)
14369 return (u1 < u2) ? -1 : +1;
14370 if (u1 == '\0')
14371 return 0;
14372 s1++;
14373 s2++;
14374 }
14375 return 0;
14376}
14377
14378Py_UNICODE*
14379Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14380{
14381 const Py_UNICODE *p;
14382 for (p = s; *p; p++)
14383 if (*p == c)
14384 return (Py_UNICODE*)p;
14385 return NULL;
14386}
14387
14388Py_UNICODE*
14389Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14390{
14391 const Py_UNICODE *p;
14392 p = s + Py_UNICODE_strlen(s);
14393 while (p != s) {
14394 p--;
14395 if (*p == c)
14396 return (Py_UNICODE*)p;
14397 }
14398 return NULL;
14399}
Victor Stinner331ea922010-08-10 16:37:20 +000014400
Victor Stinner71133ff2010-09-01 23:43:53 +000014401Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014402PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014403{
Victor Stinner577db2c2011-10-11 22:12:48 +020014404 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014405 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014407 if (!PyUnicode_Check(unicode)) {
14408 PyErr_BadArgument();
14409 return NULL;
14410 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014411 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014412 if (u == NULL)
14413 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014414 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014415 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014416 PyErr_NoMemory();
14417 return NULL;
14418 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014419 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014420 size *= sizeof(Py_UNICODE);
14421 copy = PyMem_Malloc(size);
14422 if (copy == NULL) {
14423 PyErr_NoMemory();
14424 return NULL;
14425 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014426 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014427 return copy;
14428}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014429
Georg Brandl66c221e2010-10-14 07:04:07 +000014430/* A _string module, to export formatter_parser and formatter_field_name_split
14431 to the string.Formatter class implemented in Python. */
14432
14433static PyMethodDef _string_methods[] = {
14434 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14435 METH_O, PyDoc_STR("split the argument as a field name")},
14436 {"formatter_parser", (PyCFunction) formatter_parser,
14437 METH_O, PyDoc_STR("parse the argument as a format string")},
14438 {NULL, NULL}
14439};
14440
14441static struct PyModuleDef _string_module = {
14442 PyModuleDef_HEAD_INIT,
14443 "_string",
14444 PyDoc_STR("string helper module"),
14445 0,
14446 _string_methods,
14447 NULL,
14448 NULL,
14449 NULL,
14450 NULL
14451};
14452
14453PyMODINIT_FUNC
14454PyInit__string(void)
14455{
14456 return PyModule_Create(&_string_module);
14457}
14458
14459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014460#ifdef __cplusplus
14461}
14462#endif