blob: 648d9a066130e7a5bca776b5e5e5ab3b597e5a3f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Raw, unchecked access to the utf8 field; op is treated as a
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 data of a ready string: for a compact ASCII
   string this is the memory directly following the PyASCIIObject header,
   otherwise it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw, unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the length field for a compact
   ASCII string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors; none of them checks its argument.  Note that wstr
   lives in PyASCIIObject while wstr_length lives in
   PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same as reading state.kind / length directly, but asserting first that
   op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer; op is treated as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY: after asserting
   _PyUnicode_CHECK(op), evaluates to 0 when op is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer of op aliases its character data.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases its character data.  Only the
   macro argument is referenced, so the caller's variable may have any
   name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
Victor Stinner829c0ad2011-10-03 01:08:02 +0200129/* true if the Unicode object has an allocated UTF-8 memory block
130 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_HAS_UTF8_MEMORY(op) \
132 (assert(_PyUnicode_CHECK(op)), \
133 (!PyUnicode_IS_COMPACT_ASCII(op) \
134 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200135 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
136
Victor Stinner03490912011-10-03 23:45:12 +0200137/* true if the Unicode object has an allocated wstr memory block
138 (not shared with other data) */
139#define _PyUnicode_HAS_WSTR_MEMORY(op) \
140 (assert(_PyUnicode_CHECK(op)), \
141 (_PyUnicode_WSTR(op) && \
142 (!PyUnicode_IS_READY(op) || \
143 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names; begin and end
   are pointers to the source characters, which should be of type
   "from_type *".  to points to the buffer the converted characters are
   written to: it may be any pointer expression and is converted to
   "to_type *" as a whole, so an argument such as "buf + offset" keeps the
   pointer arithmetic of its own type.  The destination must have room for
   (end - begin) elements.  Every argument is evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        /* the bulk of the input is copied four elements per iteration */ \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        /* remaining 0..3 elements */                   \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
Christian Heimes190d79e2008-01-30 11:58:22 +0000189/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F: 1 for the ASCII
   whitespace characters, 0 for everything else.  One row per 16 code
   points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
Christian Heimes190d79e2008-01-30 11:58:22 +0000252/* Same for linebreaks */
/* Lookup table indexed by code point 0x00..0x7F: 1 for the ASCII line
   break characters, 0 for everything else.  One row per 16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a unicode object.

   op must be a unicode object.  The state flags, data pointers and cached
   lengths are checked against each other with assert().  When
   check_content is non-zero and the string is not a wchar_t-only string,
   the characters are scanned as well to verify that the kind (and the
   ascii flag) matches the largest character present.

   Always returns 1, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;
    Py_ssize_t i;
    Py_UCS4 maxchar;
    void *chars;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii != 1 || base->state.compact != 1) {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *payload;

        if (base->state.compact != 1) {
            /* legacy object: the characters live in a separate buffer */
            payload = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy object that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(payload != NULL);
                if (base->state.ascii) {
                    assert(cu->utf8 == payload);
                    assert(cu->utf8_length == base->length);
                }
                else {
                    assert(cu->utf8 != payload);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(payload == NULL);
                assert(cu->utf8 == NULL);
            }
        }
        else {
            /* compact, non-ASCII: the characters follow the header */
            payload = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cu->utf8 != payload);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(base->wstr == payload);
                assert(cu->wstr_length == base->length);
            }
            else {
                assert(base->wstr != payload);
            }
        }

        /* a missing representation must have a zero cached length */
        assert(cu->utf8 != NULL || cu->utf8_length == 0);
        assert(base->wstr != NULL || cu->wstr_length == 0);
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }

    if (!check_content || kind == PyUnicode_WCHAR_KIND)
        return 1;

    /* check that the best kind is used */
    chars = PyUnicode_DATA(base);
    maxchar = 0;
    for (i = 0; i < base->length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, chars, i);
        if (maxchar < ch)
            maxchar = ch;
    }

    if (kind == PyUnicode_1BYTE_KIND) {
        if (base->state.ascii != 0) {
            assert(maxchar < 128);
        }
        else {
            assert(maxchar >= 128);
            assert(maxchar <= 255);
        }
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        assert(maxchar >= 0x100);
        assert(maxchar <= 0xFFFF);
    }
    else {
        assert(maxchar >= 0x10000);
        assert(maxchar <= MAX_UNICODE);
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
Victor Stinner3a50e702011-10-18 21:21:00 +0200504#ifdef HAVE_MBCS
505static OSVERSIONINFOEX winver;
506#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526#define BLOOM_MASK unsigned long
527
528static BLOOM_MASK bloom_linebreak;
529
Antoine Pitrouf068f942010-01-13 14:19:12 +0000530#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
531#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533#define BLOOM_LINEBREAK(ch) \
534 ((ch) < 128U ? ascii_linebreak[(ch)] : \
535 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* True if chr is found in str.  The cheap BLOOM(mask, chr) test runs
   first; only when it passes is str searched with PyUnicode_FindChar().
   NOTE(review): mask is presumably the make_bloom_mask() result for str --
   confirm at the call sites. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one more character (not byte) than requested to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier) relies on
   that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
Benjamin Petersonbac79492012-01-14 13:34:47 -05001266 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001268 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character Latin-1 string stored in
   unicode_latin1[]), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only ready (non-wchar-kind) strings of length 1 can be cached chars. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001794 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001895 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001962 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
Benjamin Petersonbac79492012-01-14 13:34:47 -05001988 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001997 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
2105#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002106
Alexander Belopolsky40018472011-02-26 01:02:56 +00002107PyObject *
2108PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002111 if (size == 0) {
2112 Py_INCREF(unicode_empty);
2113 return unicode_empty;
2114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002115 PyErr_BadInternalCall();
2116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 }
2118
Martin v. Löwis790465f2008-04-05 20:41:37 +00002119 if (size == -1) {
2120 size = wcslen(w);
2121 }
2122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124}
2125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
/* Return nonzero if `ch` is one of the integer conversions that may be
   preceded by a length modifier ('d', 'u' or 'i'). */
static int
is_sized_int_conversion(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* helper for PyUnicode_FromFormatV()

   `f` must point at the '%' that starts a format specification.  Parses
   the optional "width", ".precision" and length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z") and stores the results through the
   non-NULL output pointers (absent parts are reported as 0).

   A length modifier is only recognised -- and skipped -- when it is
   directly followed by 'd', 'u' or 'i'.

   Returns the position where parsing stopped, normally the conversion
   character.  For a truncated format the pointer is moved back by one so
   that it never rests on the terminating '\0'. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;        /* step over the '%' */
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cur); ++cur)
        field_width = (field_width*10) + *cur - '0';

    if (*cur == '.') {
        ++cur;
        for (; Py_ISDIGIT((unsigned)*cur); ++cur)
            field_precision = (field_precision*10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => cur points to "3" */
            --cur;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, cur points to "1" */
        --cur;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*cur == 'l') {
        if (is_sized_int_conversion(cur[1])) {
            is_long = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' && is_sized_int_conversion(cur[2])) {
            is_longlong = 1;
            cur += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*cur == 'z' && is_sized_int_conversion(cur[1])) {
        is_size_t = 1;
        cur += 1;
    }

    /* hand the results back to whoever asked for them */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return cur;
}
2228
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229/* maximum number of characters required for output of %ld. 21 characters
2230 allows for 64-bit integers (in decimal) and an optional sign. */
2231#define MAX_LONG_CHARS 21
2232/* maximum number of characters required for output of %lld.
2233 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2234 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2235#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002474 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002491 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002493 if (PyUnicode_READY(str) == -1) {
2494 Py_DECREF(str);
2495 goto fail;
2496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002498 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 /* Remember the str and switch to the next slot */
2501 *callresult++ = str;
2502 break;
2503 }
2504 case 'R':
2505 {
2506 PyObject *obj = va_arg(count, PyObject *);
2507 PyObject *repr;
2508 assert(obj);
2509 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002510 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002512 if (PyUnicode_READY(repr) == -1) {
2513 Py_DECREF(repr);
2514 goto fail;
2515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002517 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 /* Remember the repr and switch to the next slot */
2520 *callresult++ = repr;
2521 break;
2522 }
2523 case 'A':
2524 {
2525 PyObject *obj = va_arg(count, PyObject *);
2526 PyObject *ascii;
2527 assert(obj);
2528 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002529 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002531 if (PyUnicode_READY(ascii) == -1) {
2532 Py_DECREF(ascii);
2533 goto fail;
2534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002536 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 /* Remember the repr and switch to the next slot */
2539 *callresult++ = ascii;
2540 break;
2541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 default:
2543 /* if we stumble upon an unknown
2544 formatting code, copy the rest of
2545 the format string to the output
2546 string. (we cannot just skip the
2547 code, since there's no way to know
2548 what's in the argument list) */
2549 n += strlen(p);
2550 goto expand;
2551 }
2552 } else
2553 n++;
2554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 we don't have to resize the string.
2559 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 if (!string)
2562 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 kind = PyUnicode_KIND(string);
2564 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002570 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002571
2572 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2574 /* checking for == because the last argument could be a empty
2575 string, which causes i to point to end, the assert at the end of
2576 the loop */
2577 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002578
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 switch (*f) {
2580 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002581 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const int ordinal = va_arg(vargs, int);
2583 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002586 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 case 'p':
2591 /* unused, since we already have the result */
2592 if (*f == 'p')
2593 (void) va_arg(vargs, void *);
2594 else
2595 (void) va_arg(vargs, int);
2596 /* extract the result from numberresults and append. */
2597 for (; *numberresult; ++i, ++numberresult)
2598 PyUnicode_WRITE(kind, data, i, *numberresult);
2599 /* skip over the separating '\0' */
2600 assert(*numberresult == '\0');
2601 numberresult++;
2602 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 case 's':
2605 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 size = PyUnicode_GET_LENGTH(*callresult);
2610 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002611 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002613 /* We're done with the unicode()/repr() => forget it */
2614 Py_DECREF(*callresult);
2615 /* switch to next unicode()/repr() result */
2616 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 break;
2618 }
2619 case 'U':
2620 {
2621 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 Py_ssize_t size;
2623 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2624 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 break;
2628 }
2629 case 'V':
2630 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 size = PyUnicode_GET_LENGTH(obj);
2636 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002637 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 size = PyUnicode_GET_LENGTH(*callresult);
2641 assert(PyUnicode_KIND(*callresult) <=
2642 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002643 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002645 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 break;
2649 }
2650 case 'S':
2651 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002652 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002654 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* unused, since we already have the result */
2656 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 copy_characters(string, i, *callresult, 0, size);
2659 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* We're done with the unicode()/repr() => forget it */
2661 Py_DECREF(*callresult);
2662 /* switch to next unicode()/repr() result */
2663 ++callresult;
2664 break;
2665 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 break;
2669 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 for (; *p; ++p, ++i)
2671 PyUnicode_WRITE(kind, data, i, *p);
2672 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 goto end;
2674 }
Victor Stinner1205f272010-09-11 00:54:47 +00002675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 else {
2677 assert(i < PyUnicode_GET_LENGTH(string));
2678 PyUnicode_WRITE(kind, data, i++, *f);
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002682
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 if (callresults)
2685 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002688 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 if (callresults) {
2691 PyObject **callresult2 = callresults;
2692 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002693 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 ++callresult2;
2695 }
2696 PyObject_Free(callresults);
2697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (numberresults)
2699 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701}
2702
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703PyObject *
2704PyUnicode_FromFormat(const char *format, ...)
2705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 PyObject* ret;
2707 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
2709#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 ret = PyUnicode_FromFormatV(format, vargs);
2715 va_end(vargs);
2716 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717}
2718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719#ifdef HAVE_WCHAR_H
2720
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2722 convert a Unicode object to a wide character string.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) required to convert the unicode object. Ignore size argument.
2726
Victor Stinnerd88d9832011-09-06 02:00:05 +02002727 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 wchar_t *w,
2733 Py_ssize_t size)
2734{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 const wchar_t *wstr;
2737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002738 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 if (wstr == NULL)
2740 return -1;
2741
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (size > res)
2744 size = res + 1;
2745 else
2746 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 return res;
2749 }
2750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752}
2753
2754Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002755PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 wchar_t *w,
2757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 PyErr_BadInternalCall();
2761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002763 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Victor Stinner137c34c2010-09-29 10:25:54 +00002766wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002767PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 Py_ssize_t *size)
2769{
2770 wchar_t* buffer;
2771 Py_ssize_t buflen;
2772
2773 if (unicode == NULL) {
2774 PyErr_BadInternalCall();
2775 return NULL;
2776 }
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (buflen == -1)
2780 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 PyErr_NoMemory();
2783 return NULL;
2784 }
2785
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2787 if (buffer == NULL) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (size != NULL)
2795 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002796 return buffer;
2797}
2798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
Alexander Belopolsky40018472011-02-26 01:02:56 +00002801PyObject *
2802PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002805 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_SetString(PyExc_ValueError,
2807 "chr() arg not in range(0x110000)");
2808 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (ordinal < 256)
2812 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 v = PyUnicode_New(1, ordinal);
2815 if (v == NULL)
2816 return NULL;
2817 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002818 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820}
2821
Alexander Belopolsky40018472011-02-26 01:02:56 +00002822PyObject *
2823PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002828 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002829 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 Py_INCREF(obj);
2831 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832 }
2833 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 /* For a Unicode subtype that's not a Unicode object,
2835 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002836 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002838 PyErr_Format(PyExc_TypeError,
2839 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002840 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002841 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 const char *encoding,
2847 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 PyErr_BadInternalCall();
2854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 /* Decoding bytes objects is the most common case and should be fast */
2858 if (PyBytes_Check(obj)) {
2859 if (PyBytes_GET_SIZE(obj) == 0) {
2860 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002861 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 }
2863 else {
2864 v = PyUnicode_Decode(
2865 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2866 encoding, errors);
2867 }
2868 return v;
2869 }
2870
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 PyErr_SetString(PyExc_TypeError,
2873 "decoding str is not supported");
2874 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2878 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2879 PyErr_Format(PyExc_TypeError,
2880 "coercing to str: need bytes, bytearray "
2881 "or buffer-like object, %.80s found",
2882 Py_TYPE(obj)->tp_name);
2883 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002884 }
Tim Petersced69f82003-09-16 20:30:58 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002888 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Tim Petersced69f82003-09-16 20:30:58 +00002890 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002894 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895}
2896
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to `lower`, a buffer of `lower_len` bytes, and is
   NUL-terminated on success.

   Return 1 on success, 0 on error (the normalized name plus its terminator
   does not fit in lower_len bytes; nothing useful is left in `lower`). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator, &lower[lower_len - 1] below
       would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the trailing NUL, so this is the exact space
           strcpy() needs. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* last slot, reserved for the NUL */
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 Py_ssize_t size,
2937 const char *encoding,
2938 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002939{
2940 PyObject *buffer = NULL, *unicode;
2941 Py_buffer info;
2942 char lower[11]; /* Enough for any encoding shortcut */
2943
Fred Drakee4315f52000-05-09 19:53:39 +00002944 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002945 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002946 if ((strcmp(lower, "utf-8") == 0) ||
2947 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002948 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002949 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002951 (strcmp(lower, "iso-8859-1") == 0))
2952 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002953#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002954 else if (strcmp(lower, "mbcs") == 0)
2955 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002956#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if (strcmp(lower, "ascii") == 0)
2958 return PyUnicode_DecodeASCII(s, size, errors);
2959 else if (strcmp(lower, "utf-16") == 0)
2960 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2961 else if (strcmp(lower, "utf-32") == 0)
2962 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964
2965 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002967 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002969 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 if (buffer == NULL)
2971 goto onError;
2972 unicode = PyCodec_Decode(buffer, encoding, errors);
2973 if (unicode == NULL)
2974 goto onError;
2975 if (!PyUnicode_Check(unicode)) {
2976 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002977 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002978 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(unicode);
2980 goto onError;
2981 }
2982 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002983 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 Py_XDECREF(buffer);
2987 return NULL;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
3005 /* Decode via the codec registry */
3006 v = PyCodec_Decode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003009 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
3034 if (!PyUnicode_Check(v)) {
3035 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003036 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 Py_TYPE(v)->tp_name);
3038 Py_DECREF(v);
3039 goto onError;
3040 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 Py_ssize_t size,
3050 const char *encoding,
3051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052{
3053 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 unicode = PyUnicode_FromUnicode(s, size);
3056 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3059 Py_DECREF(unicode);
3060 return v;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Encode via the codec registry */
3079 v = PyCodec_Encode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
3082 return v;
3083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
/* Find the first character of the NUL-terminated wide string `wstr` that
   wcstombs() cannot convert in the current locale.

   Each character is converted on its own through a tiny NUL-terminated
   scratch string.  When wchar_t is 2 bytes wide, a high surrogate followed
   by a low surrogate is converted as one unit.

   Return the index (in wchar_t units) of the offending character, or 0 if
   every character converted successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *char_start;

    /* The terminator in the last slot is written once, up front. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        char_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3134
Victor Stinner1b579672011-12-17 05:47:23 +01003135static int
3136locale_error_handler(const char *errors, int *surrogateescape)
3137{
3138 if (errors == NULL) {
3139 *surrogateescape = 0;
3140 return 0;
3141 }
3142
3143 if (strcmp(errors, "strict") == 0) {
3144 *surrogateescape = 0;
3145 return 0;
3146 }
3147 if (strcmp(errors, "surrogateescape") == 0) {
3148 *surrogateescape = 1;
3149 return 0;
3150 }
3151 PyErr_Format(PyExc_ValueError,
3152 "only 'strict' and 'surrogateescape' error handlers "
3153 "are supported, not '%s'",
3154 errors);
3155 return -1;
3156}
3157
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003158PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003159PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160{
3161 Py_ssize_t wlen, wlen2;
3162 wchar_t *wstr;
3163 PyObject *bytes = NULL;
3164 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003165 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166 PyObject *exc;
3167 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003168 int surrogateescape;
3169
3170 if (locale_error_handler(errors, &surrogateescape) < 0)
3171 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172
3173 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3174 if (wstr == NULL)
3175 return NULL;
3176
3177 wlen2 = wcslen(wstr);
3178 if (wlen2 != wlen) {
3179 PyMem_Free(wstr);
3180 PyErr_SetString(PyExc_TypeError, "embedded null character");
3181 return NULL;
3182 }
3183
3184 if (surrogateescape) {
3185 /* locale encoding with surrogateescape */
3186 char *str;
3187
3188 str = _Py_wchar2char(wstr, &error_pos);
3189 if (str == NULL) {
3190 if (error_pos == (size_t)-1) {
3191 PyErr_NoMemory();
3192 PyMem_Free(wstr);
3193 return NULL;
3194 }
3195 else {
3196 goto encode_error;
3197 }
3198 }
3199 PyMem_Free(wstr);
3200
3201 bytes = PyBytes_FromString(str);
3202 PyMem_Free(str);
3203 }
3204 else {
3205 size_t len, len2;
3206
3207 len = wcstombs(NULL, wstr, 0);
3208 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003209 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003210 goto encode_error;
3211 }
3212
3213 bytes = PyBytes_FromStringAndSize(NULL, len);
3214 if (bytes == NULL) {
3215 PyMem_Free(wstr);
3216 return NULL;
3217 }
3218
3219 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3220 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003221 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003222 goto encode_error;
3223 }
3224 PyMem_Free(wstr);
3225 }
3226 return bytes;
3227
3228encode_error:
3229 errmsg = strerror(errno);
3230 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003231
3232 if (error_pos == (size_t)-1)
3233 error_pos = wcstombs_errorpos(wstr);
3234
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235 PyMem_Free(wstr);
3236 Py_XDECREF(bytes);
3237
Victor Stinner2f197072011-12-17 07:08:30 +01003238 if (errmsg != NULL) {
3239 size_t errlen;
3240 wstr = _Py_char2wchar(errmsg, &errlen);
3241 if (wstr != NULL) {
3242 reason = PyUnicode_FromWideChar(wstr, errlen);
3243 PyMem_Free(wstr);
3244 } else
3245 errmsg = NULL;
3246 }
3247 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003248 reason = PyUnicode_FromString(
3249 "wcstombs() encountered an unencodable "
3250 "wide character");
3251 if (reason == NULL)
3252 return NULL;
3253
3254 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3255 "locale", unicode,
3256 (Py_ssize_t)error_pos,
3257 (Py_ssize_t)(error_pos+1),
3258 reason);
3259 Py_DECREF(reason);
3260 if (exc != NULL) {
3261 PyCodec_StrictErrors(exc);
3262 Py_XDECREF(exc);
3263 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003264 return NULL;
3265}
3266
Victor Stinnerad158722010-10-27 00:25:46 +00003267PyObject *
3268PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003269{
Victor Stinner99b95382011-07-04 14:23:54 +02003270#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003271 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003272#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003274#else
Victor Stinner793b5312011-04-27 00:24:21 +02003275 PyInterpreterState *interp = PyThreadState_GET()->interp;
3276 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3277 cannot use it to encode and decode filenames before it is loaded. Load
3278 the Python codec requires to encode at least its own filename. Use the C
3279 version of the locale codec until the codec registry is initialized and
3280 the Python codec is loaded.
3281
3282 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3283 cannot only rely on it: check also interp->fscodec_initialized for
3284 subinterpreters. */
3285 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003286 return PyUnicode_AsEncodedString(unicode,
3287 Py_FileSystemDefaultEncoding,
3288 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003289 }
3290 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003291 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003292 }
Victor Stinnerad158722010-10-27 00:25:46 +00003293#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003294}
3295
Alexander Belopolsky40018472011-02-26 01:02:56 +00003296PyObject *
3297PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003302 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (!PyUnicode_Check(unicode)) {
3305 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 }
Fred Drakee4315f52000-05-09 19:53:39 +00003308
Fred Drakee4315f52000-05-09 19:53:39 +00003309 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003310 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003311 if ((strcmp(lower, "utf-8") == 0) ||
3312 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003313 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003314 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003318 }
Victor Stinner37296e82010-06-10 13:36:23 +00003319 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003320 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003321 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003323#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003324 else if (strcmp(lower, "mbcs") == 0)
3325 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003326#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003327 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Encode via the codec registry */
3332 v = PyCodec_Encode(unicode, encoding, errors);
3333 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
3335
3336 /* The normal path */
3337 if (PyBytes_Check(v))
3338 return v;
3339
3340 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003341 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003342 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344
3345 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3346 "encoder %s returned bytearray instead of bytes",
3347 encoding);
3348 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003349 Py_DECREF(v);
3350 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003352
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003353 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3354 Py_DECREF(v);
3355 return b;
3356 }
3357
3358 PyErr_Format(PyExc_TypeError,
3359 "encoder did not return a bytes object (type=%.400s)",
3360 Py_TYPE(v)->tp_name);
3361 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003362 return NULL;
3363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003369{
3370 PyObject *v;
3371
3372 if (!PyUnicode_Check(unicode)) {
3373 PyErr_BadArgument();
3374 goto onError;
3375 }
3376
3377 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003379
3380 /* Encode via the codec registry */
3381 v = PyCodec_Encode(unicode, encoding, errors);
3382 if (v == NULL)
3383 goto onError;
3384 if (!PyUnicode_Check(v)) {
3385 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003386 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387 Py_TYPE(v)->tp_name);
3388 Py_DECREF(v);
3389 goto onError;
3390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003392
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 return NULL;
3395}
3396
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert in the current locale.

   Return the byte offset of the invalid or incomplete sequence.  Return 0
   when no such sequence is found, and always when mbrtowc() is not
   available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len) {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to scan character by character: report offset 0. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3427
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003428PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003429PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003430 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431{
3432 wchar_t smallbuf[256];
3433 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3434 wchar_t *wstr;
3435 size_t wlen, wlen2;
3436 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003437 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003438 size_t error_pos;
3439 char *errmsg;
3440 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003441
3442 if (locale_error_handler(errors, &surrogateescape) < 0)
3443 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003444
3445 if (str[len] != '\0' || len != strlen(str)) {
3446 PyErr_SetString(PyExc_TypeError, "embedded null character");
3447 return NULL;
3448 }
3449
3450 if (surrogateescape)
3451 {
3452 wstr = _Py_char2wchar(str, &wlen);
3453 if (wstr == NULL) {
3454 if (wlen == (size_t)-1)
3455 PyErr_NoMemory();
3456 else
3457 PyErr_SetFromErrno(PyExc_OSError);
3458 return NULL;
3459 }
3460
3461 unicode = PyUnicode_FromWideChar(wstr, wlen);
3462 PyMem_Free(wstr);
3463 }
3464 else {
3465#ifndef HAVE_BROKEN_MBSTOWCS
3466 wlen = mbstowcs(NULL, str, 0);
3467#else
3468 wlen = len;
3469#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003470 if (wlen == (size_t)-1)
3471 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472 if (wlen+1 <= smallbuf_len) {
3473 wstr = smallbuf;
3474 }
3475 else {
3476 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3477 return PyErr_NoMemory();
3478
3479 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3480 if (!wstr)
3481 return PyErr_NoMemory();
3482 }
3483
3484 /* This shouldn't fail now */
3485 wlen2 = mbstowcs(wstr, str, wlen+1);
3486 if (wlen2 == (size_t)-1) {
3487 if (wstr != smallbuf)
3488 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003489 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003490 }
3491#ifdef HAVE_BROKEN_MBSTOWCS
3492 assert(wlen2 == wlen);
3493#endif
3494 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3495 if (wstr != smallbuf)
3496 PyMem_Free(wstr);
3497 }
3498 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003499
3500decode_error:
3501 errmsg = strerror(errno);
3502 assert(errmsg != NULL);
3503
3504 error_pos = mbstowcs_errorpos(str, len);
3505 if (errmsg != NULL) {
3506 size_t errlen;
3507 wstr = _Py_char2wchar(errmsg, &errlen);
3508 if (wstr != NULL) {
3509 reason = PyUnicode_FromWideChar(wstr, errlen);
3510 PyMem_Free(wstr);
3511 } else
3512 errmsg = NULL;
3513 }
3514 if (errmsg == NULL)
3515 reason = PyUnicode_FromString(
3516 "mbstowcs() encountered an invalid multibyte sequence");
3517 if (reason == NULL)
3518 return NULL;
3519
3520 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3521 "locale", str, len,
3522 (Py_ssize_t)error_pos,
3523 (Py_ssize_t)(error_pos+1),
3524 reason);
3525 Py_DECREF(reason);
3526 if (exc != NULL) {
3527 PyCodec_StrictErrors(exc);
3528 Py_XDECREF(exc);
3529 }
3530 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531}
3532
3533PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003534PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003535{
3536 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003537 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538}
3539
3540
3541PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003542PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003543 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003544 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3545}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003546
Christian Heimes5894ba72007-11-04 11:43:14 +00003547PyObject*
3548PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3549{
Victor Stinner99b95382011-07-04 14:23:54 +02003550#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003551 return PyUnicode_DecodeMBCS(s, size, NULL);
3552#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003553 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003554#else
Victor Stinner793b5312011-04-27 00:24:21 +02003555 PyInterpreterState *interp = PyThreadState_GET()->interp;
3556 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3557 cannot use it to encode and decode filenames before it is loaded. Load
3558 the Python codec requires to encode at least its own filename. Use the C
3559 version of the locale codec until the codec registry is initialized and
3560 the Python codec is loaded.
3561
3562 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3563 cannot only rely on it: check also interp->fscodec_initialized for
3564 subinterpreters. */
3565 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003566 return PyUnicode_Decode(s, size,
3567 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003568 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003569 }
3570 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003571 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572 }
Victor Stinnerad158722010-10-27 00:25:46 +00003573#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574}
3575
Martin v. Löwis011e8422009-05-05 04:43:17 +00003576
3577int
3578PyUnicode_FSConverter(PyObject* arg, void* addr)
3579{
3580 PyObject *output = NULL;
3581 Py_ssize_t size;
3582 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003583 if (arg == NULL) {
3584 Py_DECREF(*(PyObject**)addr);
3585 return 1;
3586 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003587 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003588 output = arg;
3589 Py_INCREF(output);
3590 }
3591 else {
3592 arg = PyUnicode_FromObject(arg);
3593 if (!arg)
3594 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003595 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003596 Py_DECREF(arg);
3597 if (!output)
3598 return 0;
3599 if (!PyBytes_Check(output)) {
3600 Py_DECREF(output);
3601 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3602 return 0;
3603 }
3604 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003605 size = PyBytes_GET_SIZE(output);
3606 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003607 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003608 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609 Py_DECREF(output);
3610 return 0;
3611 }
3612 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003613 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003614}
3615
3616
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003617int
3618PyUnicode_FSDecoder(PyObject* arg, void* addr)
3619{
3620 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003621 if (arg == NULL) {
3622 Py_DECREF(*(PyObject**)addr);
3623 return 1;
3624 }
3625 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003626 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003627 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003628 output = arg;
3629 Py_INCREF(output);
3630 }
3631 else {
3632 arg = PyBytes_FromObject(arg);
3633 if (!arg)
3634 return 0;
3635 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3636 PyBytes_GET_SIZE(arg));
3637 Py_DECREF(arg);
3638 if (!output)
3639 return 0;
3640 if (!PyUnicode_Check(output)) {
3641 Py_DECREF(output);
3642 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3643 return 0;
3644 }
3645 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003646 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003647 Py_DECREF(output);
3648 return 0;
3649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003650 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003651 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003652 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3653 Py_DECREF(output);
3654 return 0;
3655 }
3656 *(PyObject**)addr = output;
3657 return Py_CLEANUP_SUPPORTED;
3658}
3659
3660
Martin v. Löwis5b222132007-06-10 09:51:05 +00003661char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003662PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003663{
Christian Heimesf3863112007-11-22 07:46:41 +00003664 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003666 if (!PyUnicode_Check(unicode)) {
3667 PyErr_BadArgument();
3668 return NULL;
3669 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003670 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003671 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003672
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003673 if (PyUnicode_UTF8(unicode) == NULL) {
3674 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3676 if (bytes == NULL)
3677 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003678 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3679 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 Py_DECREF(bytes);
3681 return NULL;
3682 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003683 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3684 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3685 PyBytes_AS_STRING(bytes),
3686 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 Py_DECREF(bytes);
3688 }
3689
3690 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003691 *psize = PyUnicode_UTF8_LENGTH(unicode);
3692 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003693}
3694
3695char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3699}
3700
3701#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003702static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003703#endif
3704
3705
3706Py_UNICODE *
3707PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 const unsigned char *one_byte;
3710#if SIZEOF_WCHAR_T == 4
3711 const Py_UCS2 *two_bytes;
3712#else
3713 const Py_UCS4 *four_bytes;
3714 const Py_UCS4 *ucs4_end;
3715 Py_ssize_t num_surrogates;
3716#endif
3717 wchar_t *w;
3718 wchar_t *wchar_end;
3719
3720 if (!PyUnicode_Check(unicode)) {
3721 PyErr_BadArgument();
3722 return NULL;
3723 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003724 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 assert(_PyUnicode_KIND(unicode) != 0);
3727 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728
3729#ifdef Py_DEBUG
3730 ++unicode_as_unicode_calls;
3731#endif
3732
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003733 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003735 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3736 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 num_surrogates = 0;
3738
3739 for (; four_bytes < ucs4_end; ++four_bytes) {
3740 if (*four_bytes > 0xFFFF)
3741 ++num_surrogates;
3742 }
3743
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003744 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3745 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3746 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747 PyErr_NoMemory();
3748 return NULL;
3749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 w = _PyUnicode_WSTR(unicode);
3753 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3754 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3756 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003757 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003759 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3760 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 }
3762 else
3763 *w = *four_bytes;
3764
3765 if (w > wchar_end) {
3766 assert(0 && "Miscalculated string end");
3767 }
3768 }
3769 *w = 0;
3770#else
3771 /* sizeof(wchar_t) == 4 */
3772 Py_FatalError("Impossible unicode object state, wstr and str "
3773 "should share memory already.");
3774 return NULL;
3775#endif
3776 }
3777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3779 (_PyUnicode_LENGTH(unicode) + 1));
3780 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 PyErr_NoMemory();
3782 return NULL;
3783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3785 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3786 w = _PyUnicode_WSTR(unicode);
3787 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003789 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3790 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 for (; w < wchar_end; ++one_byte, ++w)
3792 *w = *one_byte;
3793 /* null-terminate the wstr */
3794 *w = 0;
3795 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003796 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003798 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 for (; w < wchar_end; ++two_bytes, ++w)
3800 *w = *two_bytes;
3801 /* null-terminate the wstr */
3802 *w = 0;
3803#else
3804 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003805 PyObject_FREE(_PyUnicode_WSTR(unicode));
3806 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 Py_FatalError("Impossible unicode object state, wstr "
3808 "and str should share memory already.");
3809 return NULL;
3810#endif
3811 }
3812 else {
3813 assert(0 && "This should never happen.");
3814 }
3815 }
3816 }
3817 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 *size = PyUnicode_WSTR_LENGTH(unicode);
3819 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003820}
3821
Alexander Belopolsky40018472011-02-26 01:02:56 +00003822Py_UNICODE *
3823PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826}
3827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828
Alexander Belopolsky40018472011-02-26 01:02:56 +00003829Py_ssize_t
3830PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831{
3832 if (!PyUnicode_Check(unicode)) {
3833 PyErr_BadArgument();
3834 goto onError;
3835 }
3836 return PyUnicode_GET_SIZE(unicode);
3837
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 return -1;
3840}
3841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842Py_ssize_t
3843PyUnicode_GetLength(PyObject *unicode)
3844{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003845 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846 PyErr_BadArgument();
3847 return -1;
3848 }
3849
3850 return PyUnicode_GET_LENGTH(unicode);
3851}
3852
3853Py_UCS4
3854PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3855{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003856 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3857 PyErr_BadArgument();
3858 return (Py_UCS4)-1;
3859 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003860 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003861 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 return (Py_UCS4)-1;
3863 }
3864 return PyUnicode_READ_CHAR(unicode, index);
3865}
3866
3867int
3868PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3869{
3870 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003871 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 return -1;
3873 }
Victor Stinner488fa492011-12-12 00:01:39 +01003874 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003875 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003876 PyErr_SetString(PyExc_IndexError, "string index out of range");
3877 return -1;
3878 }
Victor Stinner488fa492011-12-12 00:01:39 +01003879 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003880 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3882 index, ch);
3883 return 0;
3884}
3885
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *encoding = "utf-8";

    return encoding;
}
3891
Victor Stinner554f3f02010-06-16 23:33:54 +00003892/* create or adjust a UnicodeDecodeError */
3893static void
3894make_decode_exception(PyObject **exceptionObject,
3895 const char *encoding,
3896 const char *input, Py_ssize_t length,
3897 Py_ssize_t startpos, Py_ssize_t endpos,
3898 const char *reason)
3899{
3900 if (*exceptionObject == NULL) {
3901 *exceptionObject = PyUnicodeDecodeError_Create(
3902 encoding, input, length, startpos, endpos, reason);
3903 }
3904 else {
3905 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3906 goto onError;
3907 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3908 goto onError;
3909 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3910 goto onError;
3911 }
3912 return;
3913
3914onError:
3915 Py_DECREF(*exceptionObject);
3916 *exceptionObject = NULL;
3917}
3918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919/* error handling callback helper:
3920 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003921 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 and adjust various state variables.
3923 return 0 on success, -1 on error
3924*/
3925
Alexander Belopolsky40018472011-02-26 01:02:56 +00003926static int
3927unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003928 const char *encoding, const char *reason,
3929 const char **input, const char **inend, Py_ssize_t *startinpos,
3930 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003931 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003933 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934
3935 PyObject *restuple = NULL;
3936 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003937 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003938 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003939 Py_ssize_t requiredsize;
3940 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003941 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 int res = -1;
3943
Victor Stinner596a6c42011-11-09 00:02:18 +01003944 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3945 outsize = PyUnicode_GET_LENGTH(*output);
3946 else
3947 outsize = _PyUnicode_WSTR_LENGTH(*output);
3948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 *errorHandler = PyCodec_LookupError(errors);
3951 if (*errorHandler == NULL)
3952 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 }
3954
Victor Stinner554f3f02010-06-16 23:33:54 +00003955 make_decode_exception(exceptionObject,
3956 encoding,
3957 *input, *inend - *input,
3958 *startinpos, *endinpos,
3959 reason);
3960 if (*exceptionObject == NULL)
3961 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962
3963 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3964 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003967 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003968 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 }
3970 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003972 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003974
3975 /* Copy back the bytes variables, which might have been modified by the
3976 callback */
3977 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3978 if (!inputobj)
3979 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003980 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003982 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003983 *input = PyBytes_AS_STRING(inputobj);
3984 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003985 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003986 /* we can DECREF safely, as the exception has another reference,
3987 so the object won't go away. */
3988 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003989
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003992 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3994 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003995 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996
Victor Stinner596a6c42011-11-09 00:02:18 +01003997 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3998 /* need more space? (at least enough for what we
3999 have+the replacement+the rest of the string (starting
4000 at the new input position), so we won't have to check space
4001 when there are no errors in the rest of the string) */
4002 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4003 requiredsize = *outpos + replen + insize-newpos;
4004 if (requiredsize > outsize) {
4005 if (requiredsize<2*outsize)
4006 requiredsize = 2*outsize;
4007 if (unicode_resize(output, requiredsize) < 0)
4008 goto onError;
4009 }
4010 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004012 copy_characters(*output, *outpos, repunicode, 0, replen);
4013 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004015 else {
4016 wchar_t *repwstr;
4017 Py_ssize_t repwlen;
4018 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4019 if (repwstr == NULL)
4020 goto onError;
4021 /* need more space? (at least enough for what we
4022 have+the replacement+the rest of the string (starting
4023 at the new input position), so we won't have to check space
4024 when there are no errors in the rest of the string) */
4025 requiredsize = *outpos + repwlen + insize-newpos;
4026 if (requiredsize > outsize) {
4027 if (requiredsize < 2*outsize)
4028 requiredsize = 2*outsize;
4029 if (unicode_resize(output, requiredsize) < 0)
4030 goto onError;
4031 }
4032 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4033 *outpos += repwlen;
4034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004036 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 /* we made it! */
4039 res = 0;
4040
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 Py_XDECREF(restuple);
4043 return res;
4044}
4045
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 *
 * All macros in this section may evaluate their arguments more than once,
 * so they must not be passed expressions with side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004126
Alexander Belopolsky40018472011-02-26 01:02:56 +00004127PyObject *
4128PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004129 Py_ssize_t size,
4130 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004131{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004132 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4133}
4134
Antoine Pitrou244651a2009-05-04 18:56:13 +00004135/* The decoder. The only state we preserve is our read position,
4136 * i.e. how many characters we have consumed. So if we end in the
4137 * middle of a shift sequence we have to back off the read position
4138 * and the output to the beginning of the sequence, otherwise we lose
4139 * all the shift state (seen bits, number of bits seen, high
4140 * surrogate). */
4141
Alexander Belopolsky40018472011-02-26 01:02:56 +00004142PyObject *
4143PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004144 Py_ssize_t size,
4145 const char *errors,
4146 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004147{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004149 Py_ssize_t startinpos;
4150 Py_ssize_t endinpos;
4151 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004152 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004153 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004154 const char *errmsg = "";
4155 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004157 unsigned int base64bits = 0;
4158 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004159 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 PyObject *errorHandler = NULL;
4161 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004162
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004163 /* Start off assuming it's all ASCII. Widen later as necessary. */
4164 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165 if (!unicode)
4166 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004167 if (size == 0) {
4168 if (consumed)
4169 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004170 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004171 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004173 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004174 e = s + size;
4175
4176 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004177 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004179 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180
Antoine Pitrou244651a2009-05-04 18:56:13 +00004181 if (inShift) { /* in a base-64 section */
4182 if (IS_BASE64(ch)) { /* consume a base-64 character */
4183 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4184 base64bits += 6;
4185 s++;
4186 if (base64bits >= 16) {
4187 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004188 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004189 base64bits -= 16;
4190 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4191 if (surrogate) {
4192 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004193 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4194 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004195 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4196 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004197 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004198 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004199 }
4200 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004201 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4202 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004203 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004204 }
4205 }
Victor Stinner551ac952011-11-29 22:58:13 +01004206 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004207 /* first surrogate */
4208 surrogate = outCh;
4209 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004210 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004211 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4212 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004213 }
4214 }
4215 }
4216 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217 inShift = 0;
4218 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004220 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4221 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004222 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004224 if (base64bits > 0) { /* left-over bits */
4225 if (base64bits >= 6) {
4226 /* We've seen at least one base-64 character */
4227 errmsg = "partial character in shift sequence";
4228 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004229 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004230 else {
4231 /* Some bits remain; they should be zero */
4232 if (base64buffer != 0) {
4233 errmsg = "non-zero padding bits in shift sequence";
4234 goto utf7Error;
4235 }
4236 }
4237 }
4238 if (ch != '-') {
4239 /* '-' is absorbed; other terminating
4240 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004241 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4242 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004243 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244 }
4245 }
4246 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004248 s++; /* consume '+' */
4249 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004250 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004251 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4252 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004253 }
4254 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004255 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004256 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004257 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258 }
4259 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004260 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004261 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4262 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263 s++;
4264 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265 else {
4266 startinpos = s-starts;
4267 s++;
4268 errmsg = "unexpected special character";
4269 goto utf7Error;
4270 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 endinpos = s-starts;
4274 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 errors, &errorHandler,
4276 "utf7", errmsg,
4277 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004278 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280 }
4281
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 /* end of string */
4283
4284 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4285 /* if we're in an inconsistent state, that's an error */
4286 if (surrogate ||
4287 (base64bits >= 6) ||
4288 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004289 endinpos = size;
4290 if (unicode_decode_call_errorhandler(
4291 errors, &errorHandler,
4292 "utf7", "unterminated shift sequence",
4293 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004294 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 goto onError;
4296 if (s < e)
4297 goto restart;
4298 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004300
4301 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004304 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004305 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306 }
4307 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004308 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004310 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 goto onError;
4314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 Py_XDECREF(errorHandler);
4316 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004317 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Benjamin Peterson29060642009-01-31 22:14:21 +00004319 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 Py_XDECREF(errorHandler);
4321 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 Py_DECREF(unicode);
4323 return NULL;
4324}
4325
4326
Alexander Belopolsky40018472011-02-26 01:02:56 +00004327PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004328_PyUnicode_EncodeUTF7(PyObject *str,
4329 int base64SetO,
4330 int base64WhiteSpace,
4331 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004333 int kind;
4334 void *data;
4335 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004336 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004337 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004339 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 unsigned int base64bits = 0;
4341 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342 char * out;
4343 char * start;
4344
Benjamin Petersonbac79492012-01-14 13:34:47 -05004345 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004346 return NULL;
4347 kind = PyUnicode_KIND(str);
4348 data = PyUnicode_DATA(str);
4349 len = PyUnicode_GET_LENGTH(str);
4350
4351 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004354 /* It might be possible to tighten this worst case */
4355 allocated = 8 * len;
4356 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004357 return PyErr_NoMemory();
4358
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360 if (v == NULL)
4361 return NULL;
4362
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004363 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004364 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004365 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 if (inShift) {
4368 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4369 /* shifting out */
4370 if (base64bits) { /* output remaining bits */
4371 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4372 base64buffer = 0;
4373 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
4375 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 /* Characters not in the BASE64 set implicitly unshift the sequence
4377 so no '-' is required, except if the character is itself a '-' */
4378 if (IS_BASE64(ch) || ch == '-') {
4379 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 *out++ = (char) ch;
4382 }
4383 else {
4384 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004385 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 else { /* not in a shift sequence */
4388 if (ch == '+') {
4389 *out++ = '+';
4390 *out++ = '-';
4391 }
4392 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4393 *out++ = (char) ch;
4394 }
4395 else {
4396 *out++ = '+';
4397 inShift = 1;
4398 goto encode_char;
4399 }
4400 }
4401 continue;
4402encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004404 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004405
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 /* code first surrogate */
4407 base64bits += 16;
4408 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4409 while (base64bits >= 6) {
4410 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4411 base64bits -= 6;
4412 }
4413 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004414 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 base64bits += 16;
4417 base64buffer = (base64buffer << 16) | ch;
4418 while (base64bits >= 6) {
4419 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4420 base64bits -= 6;
4421 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 if (base64bits)
4424 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4425 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004427 if (_PyBytes_Resize(&v, out - start) < 0)
4428 return NULL;
4429 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004431PyObject *
4432PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4433 Py_ssize_t size,
4434 int base64SetO,
4435 int base64WhiteSpace,
4436 const char *errors)
4437{
4438 PyObject *result;
4439 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4440 if (tmp == NULL)
4441 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004442 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004443 base64WhiteSpace, errors);
4444 Py_DECREF(tmp);
4445 return result;
4446}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448#undef IS_BASE64
4449#undef FROM_BASE64
4450#undef TO_BASE64
4451#undef DECODE_DIRECT
4452#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454/* --- UTF-8 Codec -------------------------------------------------------- */
4455
/* Map a UTF-8 lead (prefix) byte to the length in bytes of the sequence it
   starts.  Zero means the byte is not a legal prefix.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4477
Alexander Belopolsky40018472011-02-26 01:02:56 +00004478PyObject *
4479PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004480 Py_ssize_t size,
4481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482{
Walter Dörwald69652032004-09-07 20:24:22 +00004483 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4484}
4485
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004486#include "stringlib/ucs1lib.h"
4487#include "stringlib/codecs.h"
4488#include "stringlib/undef.h"
4489
4490#include "stringlib/ucs2lib.h"
4491#include "stringlib/codecs.h"
4492#include "stringlib/undef.h"
4493
4494#include "stringlib/ucs4lib.h"
4495#include "stringlib/codecs.h"
4496#include "stringlib/undef.h"
4497
Antoine Pitrouab868312009-01-10 15:40:25 +00004498/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4499#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4500
4501/* Mask to quickly check whether a C 'long' contains a
4502 non-ASCII, UTF8-encoded char. */
4503#if (SIZEOF_LONG == 8)
4504# define ASCII_CHAR_MASK 0x8080808080808080L
4505#elif (SIZEOF_LONG == 4)
4506# define ASCII_CHAR_MASK 0x80808080L
4507#else
4508# error C 'long' size should be either 4 or 8!
4509#endif
4510
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004511/* Scans a UTF-8 string and returns the maximum character to be expected
4512 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004513
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004514 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004516 */
4517static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004518utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004521 const unsigned char *end = p + string_size;
4522 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004523
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004524 assert(unicode_size != NULL);
4525
4526 /* By having a cascade of independent loops which fallback onto each
4527 other, we minimize the amount of work done in the average loop
4528 iteration, and we also maximize the CPU's ability to predict
4529 branches correctly (because a given condition will have always the
4530 same boolean outcome except perhaps in the last iteration of the
4531 corresponding loop).
4532 In the general case this brings us rather close to decoding
4533 performance pre-PEP 393, despite the two-pass decoding.
4534
4535 Note that the pure ASCII loop is not duplicated once a non-ASCII
4536 character has been encountered. It is actually a pessimization (by
4537 a significant factor) to use this loop on text with many non-ASCII
4538 characters, and it is important to avoid bad performance on valid
4539 utf-8 data (invalid utf-8 being a different can of worms).
4540 */
4541
4542 /* ASCII */
4543 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004544 /* Only check value if it's not a ASCII char... */
4545 if (*p < 0x80) {
4546 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4547 an explanation. */
4548 if (!((size_t) p & LONG_PTR_MASK)) {
4549 /* Help register allocation */
4550 register const unsigned char *_p = p;
4551 while (_p < aligned_end) {
4552 unsigned long value = *(unsigned long *) _p;
4553 if (value & ASCII_CHAR_MASK)
4554 break;
4555 _p += SIZEOF_LONG;
4556 char_count += SIZEOF_LONG;
4557 }
4558 p = _p;
4559 if (p == end)
4560 break;
4561 }
4562 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004563 if (*p < 0x80)
4564 ++char_count;
4565 else
4566 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004567 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004568 *unicode_size = char_count;
4569 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004571_ucs1loop:
4572 for (; p < end; ++p) {
4573 if (*p < 0xc4)
4574 char_count += ((*p & 0xc0) != 0x80);
4575 else
4576 goto _ucs2loop;
4577 }
4578 *unicode_size = char_count;
4579 return 255;
4580
4581_ucs2loop:
4582 for (; p < end; ++p) {
4583 if (*p < 0xf0)
4584 char_count += ((*p & 0xc0) != 0x80);
4585 else
4586 goto _ucs4loop;
4587 }
4588 *unicode_size = char_count;
4589 return 65535;
4590
4591_ucs4loop:
4592 for (; p < end; ++p) {
4593 char_count += ((*p & 0xc0) != 0x80);
4594 }
4595 *unicode_size = char_count;
4596 return 65537;
4597}
4598
/* Store `value` at position `index` of the string being built.

   Implicit parameters taken from the expansion site: the PyObject
   *`unicode` variable (it may be replaced, since its address is handed to
   unicode_resize() and unicode_putchar()) and the `onError` label, which is
   jumped to when either helper reports failure.

   When `index` is past the current length the string is first resized to
   index + index/8, i.e. with some slack, so the final result has to be
   shrunk to its real length by the caller.  `index` is evaluated once.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = (index);                                       \
        if (pos > PyUnicode_GET_LENGTH(unicode)) {                      \
            if (unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
        }                                                               \
        if (unicode_putchar(&unicode, &pos, (value)) < 0)               \
            goto onError;                                               \
    } while (0)
4612
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004613static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004614decode_utf8_errors(const char *starts,
4615 Py_ssize_t size,
4616 const char *errors,
4617 Py_ssize_t *consumed,
4618 const char *s,
4619 PyObject *unicode,
4620 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004621{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004623 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004624 Py_ssize_t startinpos;
4625 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004626 const char *e = starts + size;
4627 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004628 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 PyObject *errorHandler = NULL;
4630 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004631
Antoine Pitrouab868312009-01-10 15:40:25 +00004632 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
4634 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004635 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
4637 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004638 /* Fast path for runs of ASCII characters. Given that common UTF-8
4639 input will consist of an overwhelming majority of ASCII
4640 characters, we try to optimize for this case by checking
4641 as many characters as a C 'long' can contain.
4642 First, check if we can do an aligned read, as most CPUs have
4643 a penalty for unaligned reads.
4644 */
4645 if (!((size_t) s & LONG_PTR_MASK)) {
4646 /* Help register allocation */
4647 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004648 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004649 while (_s < aligned_end) {
4650 /* Read a whole long at a time (either 4 or 8 bytes),
4651 and do a fast unrolled copy if it only contains ASCII
4652 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653 unsigned long value = *(unsigned long *) _s;
4654 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004655 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004656 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4657 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4658 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4659 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004660#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004661 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4662 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4663 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4664 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004665#endif
4666 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004667 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004668 }
4669 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004670 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004671 if (s == e)
4672 break;
4673 ch = (unsigned char)*s;
4674 }
4675 }
4676
4677 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004678 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 s++;
4680 continue;
4681 }
4682
4683 n = utf8_code_length[ch];
4684
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004685 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 if (consumed)
4687 break;
4688 else {
4689 errmsg = "unexpected end of data";
4690 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004691 endinpos = startinpos+1;
4692 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4693 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 goto utf8Error;
4695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
4698 switch (n) {
4699
4700 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004701 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 startinpos = s-starts;
4703 endinpos = startinpos+1;
4704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
4706 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004707 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 startinpos = s-starts;
4709 endinpos = startinpos+1;
4710 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711
4712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004713 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004714 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004716 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 goto utf8Error;
4718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004720 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004721 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 break;
4723
4724 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004725 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4726 will result in surrogates in range d800-dfff. Surrogates are
4727 not valid UTF-8 so they are rejected.
4728 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4729 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004730 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004731 (s[2] & 0xc0) != 0x80 ||
4732 ((unsigned char)s[0] == 0xE0 &&
4733 (unsigned char)s[1] < 0xA0) ||
4734 ((unsigned char)s[0] == 0xED &&
4735 (unsigned char)s[1] > 0x9F)) {
4736 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004737 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004738 endinpos = startinpos + 1;
4739
4740 /* if s[1] first two bits are 1 and 0, then the invalid
4741 continuation byte is s[2], so increment endinpos by 1,
4742 if not, s[1] is invalid and endinpos doesn't need to
4743 be incremented. */
4744 if ((s[1] & 0xC0) == 0x80)
4745 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 goto utf8Error;
4747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004749 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004750 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004751 break;
4752
4753 case 4:
4754 if ((s[1] & 0xc0) != 0x80 ||
4755 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004756 (s[3] & 0xc0) != 0x80 ||
4757 ((unsigned char)s[0] == 0xF0 &&
4758 (unsigned char)s[1] < 0x90) ||
4759 ((unsigned char)s[0] == 0xF4 &&
4760 (unsigned char)s[1] > 0x8F)) {
4761 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004763 endinpos = startinpos + 1;
4764 if ((s[1] & 0xC0) == 0x80) {
4765 endinpos++;
4766 if ((s[2] & 0xC0) == 0x80)
4767 endinpos++;
4768 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004769 goto utf8Error;
4770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004773 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004774
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004775 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 }
4778 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004780
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 if (unicode_decode_call_errorhandler(
4783 errors, &errorHandler,
4784 "utf8", errmsg,
4785 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004786 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 /* Update data because unicode_decode_call_errorhandler might have
4789 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 }
Walter Dörwald69652032004-09-07 20:24:22 +00004792 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 /* Adjust length and ready string when it contained errors and
4796 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004797 if (unicode_resize(&unicode, i) < 0)
4798 goto onError;
4799 unicode_adjust_maxchar(&unicode);
4800 if (unicode == NULL)
4801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004805 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004806 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004811 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 return NULL;
4813}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004814#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004815
Victor Stinner785938e2011-12-11 20:09:03 +01004816PyObject *
4817PyUnicode_DecodeUTF8Stateful(const char *s,
4818 Py_ssize_t size,
4819 const char *errors,
4820 Py_ssize_t *consumed)
4821{
4822 Py_UCS4 maxchar = 0;
4823 Py_ssize_t unicode_size;
4824 int has_errors = 0;
4825 PyObject *unicode;
4826 int kind;
4827 void *data;
4828 const char *starts = s;
4829 const char *e;
4830 Py_ssize_t i;
4831
4832 if (size == 0) {
4833 if (consumed)
4834 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004835 Py_INCREF(unicode_empty);
4836 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004837 }
4838
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004839 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004840
4841 /* When the string is ASCII only, just use memcpy and return.
4842 unicode_size may be != size if there is an incomplete UTF-8
4843 sequence at the end of the ASCII block. */
4844 if (maxchar < 128 && size == unicode_size) {
4845 if (consumed)
4846 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004847 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004848 }
4849
4850 unicode = PyUnicode_New(unicode_size, maxchar);
4851 if (!unicode)
4852 return NULL;
4853 kind = PyUnicode_KIND(unicode);
4854 data = PyUnicode_DATA(unicode);
4855
4856 /* Unpack UTF-8 encoded data */
4857 i = 0;
4858 e = starts + size;
4859 switch (kind) {
4860 case PyUnicode_1BYTE_KIND:
4861 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4862 break;
4863 case PyUnicode_2BYTE_KIND:
4864 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4865 break;
4866 case PyUnicode_4BYTE_KIND:
4867 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4868 break;
4869 }
4870 if (!has_errors) {
4871 /* Ensure the unicode size calculation was correct */
4872 assert(i == unicode_size);
4873 assert(s == e);
4874 if (consumed)
4875 *consumed = size;
4876 return unicode;
4877 }
4878
4879 /* In case of errors, maxchar and size computation might be incorrect;
4880 code below refits and resizes as necessary. */
4881 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4882}
4883
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape-style error handling,
   used to decode the command line arguments on Mac OS X.

   Each byte that cannot be decoded is stored as 0xDC00 + byte and decoding
   resumes at the next byte.  The result is a NUL-terminated wchar_t buffer
   allocated with PyMem_Malloc(); NULL is returned if the size check or the
   allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements (terminator included) are
       enough. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII is copied straight through */
        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* sequence cut off by the end of the input */
        if (s + seqlen > end) {
            goto surrogateescape;
        }

        if (seqlen == 0 || seqlen == 1) {
            /* not a valid lead byte */
            goto surrogateescape;
        }
        else if (seqlen == 2) {
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        s += seqlen;
        continue;

      surrogateescape:
        /* `ch` still holds the undecodable byte here */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004992/* Primary internal function which creates utf8 encoded bytes objects.
4993
4994 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004995 and allocate exactly as much space needed at the end. Else allocate the
4996 maximum possible needed (4 result bytes per Unicode character), and return
4997 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004998*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004999PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005000_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001{
Victor Stinner6099a032011-12-18 14:22:26 +01005002 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005003 void *data;
5004 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005006 if (!PyUnicode_Check(unicode)) {
5007 PyErr_BadArgument();
5008 return NULL;
5009 }
5010
5011 if (PyUnicode_READY(unicode) == -1)
5012 return NULL;
5013
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005014 if (PyUnicode_UTF8(unicode))
5015 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5016 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017
5018 kind = PyUnicode_KIND(unicode);
5019 data = PyUnicode_DATA(unicode);
5020 size = PyUnicode_GET_LENGTH(unicode);
5021
Benjamin Petersonead6b532011-12-20 17:23:42 -06005022 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005023 default:
5024 assert(0);
5025 case PyUnicode_1BYTE_KIND:
5026 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5027 assert(!PyUnicode_IS_ASCII(unicode));
5028 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5029 case PyUnicode_2BYTE_KIND:
5030 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5031 case PyUnicode_4BYTE_KIND:
5032 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034}
5035
Alexander Belopolsky40018472011-02-26 01:02:56 +00005036PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5038 Py_ssize_t size,
5039 const char *errors)
5040{
5041 PyObject *v, *unicode;
5042
5043 unicode = PyUnicode_FromUnicode(s, size);
5044 if (unicode == NULL)
5045 return NULL;
5046 v = _PyUnicode_AsUTF8String(unicode, errors);
5047 Py_DECREF(unicode);
5048 return v;
5049}
5050
5051PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005052PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005054 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055}
5056
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057/* --- UTF-32 Codec ------------------------------------------------------- */
5058
5059PyObject *
5060PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 Py_ssize_t size,
5062 const char *errors,
5063 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005064{
5065 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5066}
5067
5068PyObject *
5069PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 Py_ssize_t size,
5071 const char *errors,
5072 int *byteorder,
5073 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074{
5075 const char *starts = s;
5076 Py_ssize_t startinpos;
5077 Py_ssize_t endinpos;
5078 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005079 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005080 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081 int bo = 0; /* assume native ordering by default */
5082 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 /* Offsets from q for retrieving bytes in the right order. */
5084#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5085 int iorder[] = {0, 1, 2, 3};
5086#else
5087 int iorder[] = {3, 2, 1, 0};
5088#endif
5089 PyObject *errorHandler = NULL;
5090 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005091
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 q = (unsigned char *)s;
5093 e = q + size;
5094
5095 if (byteorder)
5096 bo = *byteorder;
5097
5098 /* Check for BOM marks (U+FEFF) in the input and adjust current
5099 byte order setting accordingly. In native mode, the leading BOM
5100 mark is skipped, in all other modes, it is copied to the output
5101 stream as-is (giving a ZWNBSP character). */
5102 if (bo == 0) {
5103 if (size >= 4) {
5104 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 if (bom == 0x0000FEFF) {
5108 q += 4;
5109 bo = -1;
5110 }
5111 else if (bom == 0xFFFE0000) {
5112 q += 4;
5113 bo = 1;
5114 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 if (bom == 0x0000FEFF) {
5117 q += 4;
5118 bo = 1;
5119 }
5120 else if (bom == 0xFFFE0000) {
5121 q += 4;
5122 bo = -1;
5123 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 }
5127
5128 if (bo == -1) {
5129 /* force LE */
5130 iorder[0] = 0;
5131 iorder[1] = 1;
5132 iorder[2] = 2;
5133 iorder[3] = 3;
5134 }
5135 else if (bo == 1) {
5136 /* force BE */
5137 iorder[0] = 3;
5138 iorder[1] = 2;
5139 iorder[2] = 1;
5140 iorder[3] = 0;
5141 }
5142
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005143 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005144 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005145 if (!unicode)
5146 return NULL;
5147 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005148 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005149 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005150
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 Py_UCS4 ch;
5153 /* remaining bytes at the end? (size should be divisible by 4) */
5154 if (e-q<4) {
5155 if (consumed)
5156 break;
5157 errmsg = "truncated data";
5158 startinpos = ((const char *)q)-starts;
5159 endinpos = ((const char *)e)-starts;
5160 goto utf32Error;
5161 /* The remaining input chars are ignored if the callback
5162 chooses to skip the input */
5163 }
5164 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5165 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 if (ch >= 0x110000)
5168 {
5169 errmsg = "codepoint not in range(0x110000)";
5170 startinpos = ((const char *)q)-starts;
5171 endinpos = startinpos+4;
5172 goto utf32Error;
5173 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005174 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5175 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 q += 4;
5177 continue;
5178 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 if (unicode_decode_call_errorhandler(
5180 errors, &errorHandler,
5181 "utf32", errmsg,
5182 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005183 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185 }
5186
5187 if (byteorder)
5188 *byteorder = bo;
5189
5190 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005192
5193 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005194 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195 goto onError;
5196
5197 Py_XDECREF(errorHandler);
5198 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005199 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005202 Py_DECREF(unicode);
5203 Py_XDECREF(errorHandler);
5204 Py_XDECREF(exc);
5205 return NULL;
5206}
5207
5208PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005209_PyUnicode_EncodeUTF32(PyObject *str,
5210 const char *errors,
5211 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005213 int kind;
5214 void *data;
5215 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005216 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005217 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005218 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219 /* Offsets from p for storing byte pairs in the right order. */
5220#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5221 int iorder[] = {0, 1, 2, 3};
5222#else
5223 int iorder[] = {3, 2, 1, 0};
5224#endif
5225
Benjamin Peterson29060642009-01-31 22:14:21 +00005226#define STORECHAR(CH) \
5227 do { \
5228 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5229 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5230 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5231 p[iorder[0]] = (CH) & 0xff; \
5232 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 } while(0)
5234
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005235 if (!PyUnicode_Check(str)) {
5236 PyErr_BadArgument();
5237 return NULL;
5238 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005239 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005240 return NULL;
5241 kind = PyUnicode_KIND(str);
5242 data = PyUnicode_DATA(str);
5243 len = PyUnicode_GET_LENGTH(str);
5244
5245 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005246 bytesize = nsize * 4;
5247 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005249 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250 if (v == NULL)
5251 return NULL;
5252
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005253 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005256 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005257 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258
5259 if (byteorder == -1) {
5260 /* force LE */
5261 iorder[0] = 0;
5262 iorder[1] = 1;
5263 iorder[2] = 2;
5264 iorder[3] = 3;
5265 }
5266 else if (byteorder == 1) {
5267 /* force BE */
5268 iorder[0] = 3;
5269 iorder[1] = 2;
5270 iorder[2] = 1;
5271 iorder[3] = 0;
5272 }
5273
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005274 for (i = 0; i < len; i++)
5275 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005276
5277 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005278 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005279#undef STORECHAR
5280}
5281
Alexander Belopolsky40018472011-02-26 01:02:56 +00005282PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005283PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5284 Py_ssize_t size,
5285 const char *errors,
5286 int byteorder)
5287{
5288 PyObject *result;
5289 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5290 if (tmp == NULL)
5291 return NULL;
5292 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5293 Py_DECREF(tmp);
5294 return result;
5295}
5296
5297PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005298PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005299{
Victor Stinnerb960b342011-11-20 19:12:52 +01005300 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005301}
5302
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303/* --- UTF-16 Codec ------------------------------------------------------- */
5304
Tim Peters772747b2001-08-09 22:21:55 +00005305PyObject *
5306PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 Py_ssize_t size,
5308 const char *errors,
5309 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310{
Walter Dörwald69652032004-09-07 20:24:22 +00005311 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5312}
5313
Antoine Pitrouab868312009-01-10 15:40:25 +00005314/* Two masks for fast checking of whether a C 'long' may contain
5315 UTF16-encoded surrogate characters. This is an efficient heuristic,
5316 assuming that non-surrogate characters with a code point >= 0x8000 are
5317 rare in most input.
5318 FAST_CHAR_MASK is used when the input is in native byte ordering,
5319 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005320*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005321#if (SIZEOF_LONG == 8)
5322# define FAST_CHAR_MASK 0x8000800080008000L
5323# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5324#elif (SIZEOF_LONG == 4)
5325# define FAST_CHAR_MASK 0x80008000L
5326# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5327#else
5328# error C 'long' size should be either 4 or 8!
5329#endif
5330
Walter Dörwald69652032004-09-07 20:24:22 +00005331PyObject *
5332PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 Py_ssize_t size,
5334 const char *errors,
5335 int *byteorder,
5336 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 Py_ssize_t startinpos;
5340 Py_ssize_t endinpos;
5341 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005342 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005344 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005345 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005346 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005347 /* Offsets from q for retrieving byte pairs in the right order. */
5348#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5349 int ihi = 1, ilo = 0;
5350#else
5351 int ihi = 0, ilo = 1;
5352#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 PyObject *errorHandler = NULL;
5354 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
5356 /* Note: size will always be longer than the resulting Unicode
5357 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005358 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 if (!unicode)
5360 return NULL;
5361 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005362 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005363 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
Tim Peters772747b2001-08-09 22:21:55 +00005365 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005366 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367
5368 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005369 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005371 /* Check for BOM marks (U+FEFF) in the input and adjust current
5372 byte order setting accordingly. In native mode, the leading BOM
5373 mark is skipped, in all other modes, it is copied to the output
5374 stream as-is (giving a ZWNBSP character). */
5375 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005376 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005377 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005378#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 if (bom == 0xFEFF) {
5380 q += 2;
5381 bo = -1;
5382 }
5383 else if (bom == 0xFFFE) {
5384 q += 2;
5385 bo = 1;
5386 }
Tim Petersced69f82003-09-16 20:30:58 +00005387#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 if (bom == 0xFEFF) {
5389 q += 2;
5390 bo = 1;
5391 }
5392 else if (bom == 0xFFFE) {
5393 q += 2;
5394 bo = -1;
5395 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005396#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
Tim Peters772747b2001-08-09 22:21:55 +00005400 if (bo == -1) {
5401 /* force LE */
5402 ihi = 1;
5403 ilo = 0;
5404 }
5405 else if (bo == 1) {
5406 /* force BE */
5407 ihi = 0;
5408 ilo = 1;
5409 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005410#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5411 native_ordering = ilo < ihi;
5412#else
5413 native_ordering = ilo > ihi;
5414#endif
Tim Peters772747b2001-08-09 22:21:55 +00005415
Antoine Pitrouab868312009-01-10 15:40:25 +00005416 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005417 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005418 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005419 /* First check for possible aligned read of a C 'long'. Unaligned
5420 reads are more expensive, better to defer to another iteration. */
5421 if (!((size_t) q & LONG_PTR_MASK)) {
5422 /* Fast path for runs of non-surrogate chars. */
5423 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005424 int kind = PyUnicode_KIND(unicode);
5425 void *data = PyUnicode_DATA(unicode);
5426 while (_q < aligned_end) {
5427 unsigned long block = * (unsigned long *) _q;
5428 unsigned short *pblock = (unsigned short*)&block;
5429 Py_UCS4 maxch;
5430 if (native_ordering) {
5431 /* Can use buffer directly */
5432 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005433 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005434 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005435 else {
5436 /* Need to byte-swap */
5437 unsigned char *_p = (unsigned char*)pblock;
5438 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005439 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005440 _p[0] = _q[1];
5441 _p[1] = _q[0];
5442 _p[2] = _q[3];
5443 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005444#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005445 _p[4] = _q[5];
5446 _p[5] = _q[4];
5447 _p[6] = _q[7];
5448 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005449#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005450 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005451 maxch = Py_MAX(pblock[0], pblock[1]);
5452#if SIZEOF_LONG == 8
5453 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5454#endif
5455 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5456 if (unicode_widen(&unicode, maxch) < 0)
5457 goto onError;
5458 kind = PyUnicode_KIND(unicode);
5459 data = PyUnicode_DATA(unicode);
5460 }
5461 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5462 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5463#if SIZEOF_LONG == 8
5464 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5465 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5466#endif
5467 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005468 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005469 q = _q;
5470 if (q >= e)
5471 break;
5472 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474
Benjamin Peterson14339b62009-01-31 16:36:08 +00005475 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005476
Victor Stinner551ac952011-11-29 22:58:13 +01005477 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005478 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5479 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 continue;
5481 }
5482
5483 /* UTF-16 code pair: */
5484 if (q > e) {
5485 errmsg = "unexpected end of data";
5486 startinpos = (((const char *)q) - 2) - starts;
5487 endinpos = ((const char *)e) + 1 - starts;
5488 goto utf16Error;
5489 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005490 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5491 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005493 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005494 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005495 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005496 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 continue;
5498 }
5499 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005500 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 startinpos = (((const char *)q)-4)-starts;
5502 endinpos = startinpos+2;
5503 goto utf16Error;
5504 }
5505
Benjamin Peterson14339b62009-01-31 16:36:08 +00005506 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 errmsg = "illegal encoding";
5508 startinpos = (((const char *)q)-2)-starts;
5509 endinpos = startinpos+2;
5510 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005511
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005514 errors,
5515 &errorHandler,
5516 "utf16", errmsg,
5517 &starts,
5518 (const char **)&e,
5519 &startinpos,
5520 &endinpos,
5521 &exc,
5522 (const char **)&q,
5523 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005527 /* remaining byte at the end? (size should be even) */
5528 if (e == q) {
5529 if (!consumed) {
5530 errmsg = "truncated data";
5531 startinpos = ((const char *)q) - starts;
5532 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005533 if (unicode_decode_call_errorhandler(
5534 errors,
5535 &errorHandler,
5536 "utf16", errmsg,
5537 &starts,
5538 (const char **)&e,
5539 &startinpos,
5540 &endinpos,
5541 &exc,
5542 (const char **)&q,
5543 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005544 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005545 goto onError;
5546 /* The remaining input chars are ignored if the callback
5547 chooses to skip the input */
5548 }
5549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550
5551 if (byteorder)
5552 *byteorder = bo;
5553
Walter Dörwald69652032004-09-07 20:24:22 +00005554 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005556
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005558 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 goto onError;
5560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 Py_XDECREF(errorHandler);
5562 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005563 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 Py_XDECREF(errorHandler);
5568 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 return NULL;
5570}
5571
Antoine Pitrouab868312009-01-10 15:40:25 +00005572#undef FAST_CHAR_MASK
5573#undef SWAPPED_FAST_CHAR_MASK
5574
Tim Peters772747b2001-08-09 22:21:55 +00005575PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005576_PyUnicode_EncodeUTF16(PyObject *str,
5577 const char *errors,
5578 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005580 int kind;
5581 void *data;
5582 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005583 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005584 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005585 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005586 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005587 /* Offsets from p for storing byte pairs in the right order. */
5588#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5589 int ihi = 1, ilo = 0;
5590#else
5591 int ihi = 0, ilo = 1;
5592#endif
5593
Benjamin Peterson29060642009-01-31 22:14:21 +00005594#define STORECHAR(CH) \
5595 do { \
5596 p[ihi] = ((CH) >> 8) & 0xff; \
5597 p[ilo] = (CH) & 0xff; \
5598 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005599 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005601 if (!PyUnicode_Check(str)) {
5602 PyErr_BadArgument();
5603 return NULL;
5604 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005606 return NULL;
5607 kind = PyUnicode_KIND(str);
5608 data = PyUnicode_DATA(str);
5609 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005610
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005611 pairs = 0;
5612 if (kind == PyUnicode_4BYTE_KIND)
5613 for (i = 0; i < len; i++)
5614 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5615 pairs++;
5616 /* 2 * (len + pairs + (byteorder == 0)) */
5617 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005620 bytesize = nsize * 2;
5621 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005623 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 if (v == NULL)
5625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005627 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005630 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005631 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005632
5633 if (byteorder == -1) {
5634 /* force LE */
5635 ihi = 1;
5636 ilo = 0;
5637 }
5638 else if (byteorder == 1) {
5639 /* force BE */
5640 ihi = 0;
5641 ilo = 1;
5642 }
5643
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005644 for (i = 0; i < len; i++) {
5645 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5646 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005648 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5649 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 }
Tim Peters772747b2001-08-09 22:21:55 +00005651 STORECHAR(ch);
5652 if (ch2)
5653 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005654 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005655
5656 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005657 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005658#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659}
5660
Alexander Belopolsky40018472011-02-26 01:02:56 +00005661PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005662PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5663 Py_ssize_t size,
5664 const char *errors,
5665 int byteorder)
5666{
5667 PyObject *result;
5668 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5669 if (tmp == NULL)
5670 return NULL;
5671 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5672 Py_DECREF(tmp);
5673 return result;
5674}
5675
5676PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005677PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005679 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680}
5681
5682/* --- Unicode Escape Codec ----------------------------------------------- */
5683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5685 if all the escapes in the string make it still a valid ASCII string.
5686 Returns -1 if any escapes were found which cause the string to
5687 pop out of ASCII range. Otherwise returns the length of the
5688 required buffer to hold the string.
5689 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005690static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5692{
5693 const unsigned char *p = (const unsigned char *)s;
5694 const unsigned char *end = p + size;
5695 Py_ssize_t length = 0;
5696
5697 if (size < 0)
5698 return -1;
5699
5700 for (; p < end; ++p) {
5701 if (*p > 127) {
5702 /* Non-ASCII */
5703 return -1;
5704 }
5705 else if (*p != '\\') {
5706 /* Normal character */
5707 ++length;
5708 }
5709 else {
5710 /* Backslash-escape, check next char */
5711 ++p;
5712 /* Escape sequence reaches till end of string or
5713 non-ASCII follow-up. */
5714 if (p >= end || *p > 127)
5715 return -1;
5716 switch (*p) {
5717 case '\n':
5718 /* backslash + \n result in zero characters */
5719 break;
5720 case '\\': case '\'': case '\"':
5721 case 'b': case 'f': case 't':
5722 case 'n': case 'r': case 'v': case 'a':
5723 ++length;
5724 break;
5725 case '0': case '1': case '2': case '3':
5726 case '4': case '5': case '6': case '7':
5727 case 'x': case 'u': case 'U': case 'N':
5728 /* these do not guarantee ASCII characters */
5729 return -1;
5730 default:
5731 /* count the backslash + the other character */
5732 length += 2;
5733 }
5734 }
5735 }
5736 return length;
5737}
5738
Fredrik Lundh06d12682001-01-24 07:59:11 +00005739static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005740
Alexander Belopolsky40018472011-02-26 01:02:56 +00005741PyObject *
5742PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005743 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747 Py_ssize_t startinpos;
5748 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005750 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005752 char* message;
5753 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 PyObject *errorHandler = NULL;
5755 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005756 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760
5761 /* After length_of_escaped_ascii_string() there are two alternatives,
5762 either the string is pure ASCII with named escapes like \n, etc.
5763 and we determined it's exact size (common case)
5764 or it contains \x, \u, ... escape sequences. then we create a
5765 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 if (len >= 0) {
5767 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 if (!v)
5769 goto onError;
5770 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005771 }
5772 else {
5773 /* Escaped strings will always be longer than the resulting
5774 Unicode string, so we start with size here and then reduce the
5775 length after conversion to the true value.
5776 (but if the error callback returns a long replacement string
5777 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005778 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779 if (!v)
5780 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005782 }
5783
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005785 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 while (s < end) {
5790 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005791 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005794 /* The only case in which i == ascii_length is a backslash
5795 followed by a newline. */
5796 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 /* Non-escape characters are interpreted as Unicode ordinals */
5799 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005800 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 continue;
5803 }
5804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 /* \ - Escapes */
5807 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005808 c = *s++;
5809 if (s > end)
5810 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005811
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005812 /* The only case in which i == ascii_length is a backslash
5813 followed by a newline. */
5814 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005815
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005816 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005819#define WRITECHAR(ch) \
5820 do { \
5821 if (unicode_putchar(&v, &i, ch) < 0) \
5822 goto onError; \
5823 }while(0)
5824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005826 case '\\': WRITECHAR('\\'); break;
5827 case '\'': WRITECHAR('\''); break;
5828 case '\"': WRITECHAR('\"'); break;
5829 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831 case 'f': WRITECHAR('\014'); break;
5832 case 't': WRITECHAR('\t'); break;
5833 case 'n': WRITECHAR('\n'); break;
5834 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005836 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005838 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 case '0': case '1': case '2': case '3':
5842 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005843 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005844 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005845 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005846 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005847 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005849 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 break;
5851
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 /* hex escapes */
5853 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005855 digits = 2;
5856 message = "truncated \\xXX escape";
5857 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005861 digits = 4;
5862 message = "truncated \\uXXXX escape";
5863 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005866 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005867 digits = 8;
5868 message = "truncated \\UXXXXXXXX escape";
5869 hexescape:
5870 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 if (s+digits>end) {
5872 endinpos = size;
5873 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 errors, &errorHandler,
5875 "unicodeescape", "end of string in escape sequence",
5876 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005877 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 goto onError;
5879 goto nextByte;
5880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881 for (j = 0; j < digits; ++j) {
5882 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005883 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 errors, &errorHandler,
5887 "unicodeescape", message,
5888 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005890 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005891 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005893 }
5894 chr = (chr<<4) & ~0xF;
5895 if (c >= '0' && c <= '9')
5896 chr += c - '0';
5897 else if (c >= 'a' && c <= 'f')
5898 chr += 10 + c - 'a';
5899 else
5900 chr += 10 + c - 'A';
5901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005902 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005903 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 /* _decoding_error will have already written into the
5905 target buffer. */
5906 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005907 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005908 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005909 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005910 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005911 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 errors, &errorHandler,
5915 "unicodeescape", "illegal Unicode character",
5916 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005917 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005918 goto onError;
5919 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920 break;
5921
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005923 case 'N':
5924 message = "malformed \\N character escape";
5925 if (ucnhash_CAPI == NULL) {
5926 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005927 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5928 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005929 if (ucnhash_CAPI == NULL)
5930 goto ucnhashError;
5931 }
5932 if (*s == '{') {
5933 const char *start = s+1;
5934 /* look for the closing brace */
5935 while (*s != '}' && s < end)
5936 s++;
5937 if (s > start && s < end && *s == '}') {
5938 /* found a name. look it up in the unicode database */
5939 message = "unknown Unicode character name";
5940 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005942 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005943 goto store;
5944 }
5945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 errors, &errorHandler,
5949 "unicodeescape", message,
5950 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005951 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005952 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005953 break;
5954
5955 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005956 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957 message = "\\ at end of string";
5958 s--;
5959 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 errors, &errorHandler,
5962 "unicodeescape", message,
5963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005965 goto onError;
5966 }
5967 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 WRITECHAR('\\');
5969 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005970 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005976#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977
Victor Stinner16e6a802011-12-12 13:24:15 +01005978 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005980 Py_XDECREF(errorHandler);
5981 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005982 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005983
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005985 PyErr_SetString(
5986 PyExc_UnicodeError,
5987 "\\N escapes not supported (can't load unicodedata module)"
5988 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005989 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990 Py_XDECREF(errorHandler);
5991 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005992 return NULL;
5993
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 Py_XDECREF(errorHandler);
5997 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 return NULL;
5999}
6000
6001/* Return a Unicode-Escape string version of the Unicode object.
6002
6003 If quotes is true, the string is enclosed in u"" or u'' quotes as
6004 appropriate.
6005
6006*/
6007
Alexander Belopolsky40018472011-02-26 01:02:56 +00006008PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006012 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006014 int kind;
6015 void *data;
6016 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Thomas Wouters89f507f2006-12-13 04:49:30 +00006018 /* Initial allocation is based on the longest-possible unichr
6019 escape.
6020
6021 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6022 unichr, so in this case it's the longest unichr escape. In
6023 narrow (UTF-16) builds this is five chars per source unichr
6024 since there are two unichrs in the surrogate pair, so in narrow
6025 (UTF-16) builds it's not the longest unichr escape.
6026
6027 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6028 so in the narrow (UTF-16) build case it's the longest unichr
6029 escape.
6030 */
6031
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006032 if (!PyUnicode_Check(unicode)) {
6033 PyErr_BadArgument();
6034 return NULL;
6035 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006036 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037 return NULL;
6038 len = PyUnicode_GET_LENGTH(unicode);
6039 kind = PyUnicode_KIND(unicode);
6040 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006041 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6043 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6044 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6045 }
6046
6047 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006048 return PyBytes_FromStringAndSize(NULL, 0);
6049
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006052
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006053 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006055 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 if (repr == NULL)
6058 return NULL;
6059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006060 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006062 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006063 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006064
Walter Dörwald79e913e2007-05-12 11:08:06 +00006065 /* Escape backslashes */
6066 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 *p++ = '\\';
6068 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006069 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006071
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006072 /* Map 21-bit characters to '\U00xxxxxx' */
6073 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006074 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006075 *p++ = '\\';
6076 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006077 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6078 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6079 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6080 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6081 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6082 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6083 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6084 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006086 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006089 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 *p++ = '\\';
6091 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006092 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6093 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6094 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6095 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006098 /* Map special whitespace to '\t', \n', '\r' */
6099 else if (ch == '\t') {
6100 *p++ = '\\';
6101 *p++ = 't';
6102 }
6103 else if (ch == '\n') {
6104 *p++ = '\\';
6105 *p++ = 'n';
6106 }
6107 else if (ch == '\r') {
6108 *p++ = '\\';
6109 *p++ = 'r';
6110 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006111
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006112 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006113 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006115 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006116 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6117 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 /* Copy everything else as-is */
6121 else
6122 *p++ = (char) ch;
6123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006125 assert(p - PyBytes_AS_STRING(repr) > 0);
6126 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6127 return NULL;
6128 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129}
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6133 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006135 PyObject *result;
6136 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6137 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139 result = PyUnicode_AsUnicodeEscapeString(tmp);
6140 Py_DECREF(tmp);
6141 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142}
6143
6144/* --- Raw Unicode Escape Codec ------------------------------------------- */
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
6147PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006148 Py_ssize_t size,
6149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006152 Py_ssize_t startinpos;
6153 Py_ssize_t endinpos;
6154 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006155 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 const char *end;
6157 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 PyObject *errorHandler = NULL;
6159 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 /* Escaped strings will always be longer than the resulting
6162 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 length after conversion to the true value. (But decoding error
6164 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006165 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006169 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006170 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 end = s + size;
6172 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 unsigned char c;
6174 Py_UCS4 x;
6175 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006176 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 /* Non-escape characters are interpreted as Unicode ordinals */
6179 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006180 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6181 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006183 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 startinpos = s-starts;
6185
6186 /* \u-escapes are only interpreted iff the number of leading
6187 backslashes if odd */
6188 bs = s;
6189 for (;s < end;) {
6190 if (*s != '\\')
6191 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006192 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6193 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 }
6195 if (((s - bs) & 1) == 0 ||
6196 s >= end ||
6197 (*s != 'u' && *s != 'U')) {
6198 continue;
6199 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006200 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 count = *s=='u' ? 4 : 8;
6202 s++;
6203
6204 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 for (x = 0, i = 0; i < count; ++i, ++s) {
6206 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006207 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 endinpos = s-starts;
6209 if (unicode_decode_call_errorhandler(
6210 errors, &errorHandler,
6211 "rawunicodeescape", "truncated \\uXXXX",
6212 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006213 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 goto onError;
6215 goto nextByte;
6216 }
6217 x = (x<<4) & ~0xF;
6218 if (c >= '0' && c <= '9')
6219 x += c - '0';
6220 else if (c >= 'a' && c <= 'f')
6221 x += 10 + c - 'a';
6222 else
6223 x += 10 + c - 'A';
6224 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006225 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 if (unicode_putchar(&v, &outpos, x) < 0)
6227 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006228 } else {
6229 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006230 if (unicode_decode_call_errorhandler(
6231 errors, &errorHandler,
6232 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006234 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006236 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 nextByte:
6238 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006240 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006242 Py_XDECREF(errorHandler);
6243 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006244 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 Py_XDECREF(errorHandler);
6249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 return NULL;
6251}
6252
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006253
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006255PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006257 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 char *p;
6259 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006260 Py_ssize_t expandsize, pos;
6261 int kind;
6262 void *data;
6263 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006265 if (!PyUnicode_Check(unicode)) {
6266 PyErr_BadArgument();
6267 return NULL;
6268 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006269 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006270 return NULL;
6271 kind = PyUnicode_KIND(unicode);
6272 data = PyUnicode_DATA(unicode);
6273 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006274 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6275 bytes, and 1 byte characters 4. */
6276 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006280
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 if (repr == NULL)
6283 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006284 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006285 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006287 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006288 for (pos = 0; pos < len; pos++) {
6289 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* Map 32-bit characters to '\Uxxxxxxxx' */
6291 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006292 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006293 *p++ = '\\';
6294 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006295 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6296 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6297 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6298 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6299 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6300 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6301 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6302 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006305 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 *p++ = '\\';
6307 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006308 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6309 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6310 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6311 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 /* Copy everything else as-is */
6314 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 *p++ = (char) ch;
6316 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006317
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 assert(p > q);
6319 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006320 return NULL;
6321 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Alexander Belopolsky40018472011-02-26 01:02:56 +00006324PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6326 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006328 PyObject *result;
6329 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6330 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006331 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6333 Py_DECREF(tmp);
6334 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006337/* --- Unicode Internal Codec ------------------------------------------- */
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
6340_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006341 Py_ssize_t size,
6342 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343{
6344 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006345 Py_ssize_t startinpos;
6346 Py_ssize_t endinpos;
6347 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006348 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006349 const char *end;
6350 const char *reason;
6351 PyObject *errorHandler = NULL;
6352 PyObject *exc = NULL;
6353
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006354 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006355 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006356 1))
6357 return NULL;
6358
Thomas Wouters89f507f2006-12-13 04:49:30 +00006359 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006360 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006361 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006363 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006364 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006365 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006366 end = s + size;
6367
6368 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006369 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006370 Py_UCS4 ch;
6371 /* We copy the raw representation one byte at a time because the
6372 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006373 ((char *) &uch)[0] = s[0];
6374 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006375#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006376 ((char *) &uch)[2] = s[2];
6377 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006378#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006379 ch = uch;
6380
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006381 /* We have to sanity check the raw data, otherwise doom looms for
6382 some malformed UCS-4 data. */
6383 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006384#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006385 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006386#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006387 end-s < Py_UNICODE_SIZE
6388 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006390 startinpos = s - starts;
6391 if (end-s < Py_UNICODE_SIZE) {
6392 endinpos = end-starts;
6393 reason = "truncated input";
6394 }
6395 else {
6396 endinpos = s - starts + Py_UNICODE_SIZE;
6397 reason = "illegal code point (> 0x10FFFF)";
6398 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399 if (unicode_decode_call_errorhandler(
6400 errors, &errorHandler,
6401 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006402 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006403 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006404 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006405 continue;
6406 }
6407
6408 s += Py_UNICODE_SIZE;
6409#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006410 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006411 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006412 Py_UNICODE uch2;
6413 ((char *) &uch2)[0] = s[0];
6414 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006415 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006416 {
Victor Stinner551ac952011-11-29 22:58:13 +01006417 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006419 }
6420 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006421#endif
6422
6423 if (unicode_putchar(&v, &outpos, ch) < 0)
6424 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006425 }
6426
Victor Stinner16e6a802011-12-12 13:24:15 +01006427 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006428 goto onError;
6429 Py_XDECREF(errorHandler);
6430 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006431 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006432
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006434 Py_XDECREF(v);
6435 Py_XDECREF(errorHandler);
6436 Py_XDECREF(exc);
6437 return NULL;
6438}
6439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440/* --- Latin-1 Codec ------------------------------------------------------ */
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442PyObject *
6443PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006444 Py_ssize_t size,
6445 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006448 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449}
6450
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006452static void
6453make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006454 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006455 PyObject *unicode,
6456 Py_ssize_t startpos, Py_ssize_t endpos,
6457 const char *reason)
6458{
6459 if (*exceptionObject == NULL) {
6460 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006462 encoding, unicode, startpos, endpos, reason);
6463 }
6464 else {
6465 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6466 goto onError;
6467 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6468 goto onError;
6469 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6470 goto onError;
6471 return;
6472 onError:
6473 Py_DECREF(*exceptionObject);
6474 *exceptionObject = NULL;
6475 }
6476}
6477
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006479static void
6480raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006481 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006482 PyObject *unicode,
6483 Py_ssize_t startpos, Py_ssize_t endpos,
6484 const char *reason)
6485{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006486 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006487 encoding, unicode, startpos, endpos, reason);
6488 if (*exceptionObject != NULL)
6489 PyCodec_StrictErrors(*exceptionObject);
6490}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491
6492/* error handling callback helper:
6493 build arguments, call the callback and check the arguments,
6494 put the result into newpos and return the replacement string, which
6495 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006496static PyObject *
6497unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006498 PyObject **errorHandler,
6499 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006501 Py_ssize_t startpos, Py_ssize_t endpos,
6502 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006503{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006504 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506 PyObject *restuple;
6507 PyObject *resunicode;
6508
6509 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 }
6514
Benjamin Petersonbac79492012-01-14 13:34:47 -05006515 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 return NULL;
6517 len = PyUnicode_GET_LENGTH(unicode);
6518
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006519 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006521 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523
6524 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006529 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 Py_DECREF(restuple);
6531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006533 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 &resunicode, newpos)) {
6535 Py_DECREF(restuple);
6536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006538 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6539 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6540 Py_DECREF(restuple);
6541 return NULL;
6542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 *newpos = len + *newpos;
6545 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6547 Py_DECREF(restuple);
6548 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006549 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_INCREF(resunicode);
6551 Py_DECREF(restuple);
6552 return resunicode;
6553}
6554
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006557 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006558 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 /* input state */
6561 Py_ssize_t pos=0, size;
6562 int kind;
6563 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 /* output object */
6565 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 /* pointer into the output */
6567 char *str;
6568 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006570 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6571 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 PyObject *errorHandler = NULL;
6573 PyObject *exc = NULL;
6574 /* the following variable is used for caching string comparisons
6575 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6576 int known_errorHandler = -1;
6577
Benjamin Petersonbac79492012-01-14 13:34:47 -05006578 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006579 return NULL;
6580 size = PyUnicode_GET_LENGTH(unicode);
6581 kind = PyUnicode_KIND(unicode);
6582 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583 /* allocate enough for a simple encoding without
6584 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006585 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006586 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006587 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006589 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006590 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591 ressize = size;
6592
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 while (pos < size) {
6594 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 /* can we encode this? */
6597 if (c<limit) {
6598 /* no overflow check, because we know that the space is enough */
6599 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006600 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 Py_ssize_t requiredsize;
6604 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 Py_ssize_t collstart = pos;
6608 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 ++collend;
6612 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6613 if (known_errorHandler==-1) {
6614 if ((errors==NULL) || (!strcmp(errors, "strict")))
6615 known_errorHandler = 1;
6616 else if (!strcmp(errors, "replace"))
6617 known_errorHandler = 2;
6618 else if (!strcmp(errors, "ignore"))
6619 known_errorHandler = 3;
6620 else if (!strcmp(errors, "xmlcharrefreplace"))
6621 known_errorHandler = 4;
6622 else
6623 known_errorHandler = 0;
6624 }
6625 switch (known_errorHandler) {
6626 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006627 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 goto onError;
6629 case 2: /* replace */
6630 while (collstart++<collend)
6631 *str++ = '?'; /* fall through */
6632 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 break;
6635 case 4: /* xmlcharrefreplace */
6636 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637 /* determine replacement size */
6638 for (i = collstart, repsize = 0; i < collend; ++i) {
6639 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6640 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006652 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006653 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006655 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 if (requiredsize > ressize) {
6659 if (requiredsize<2*ressize)
6660 requiredsize = 2*ressize;
6661 if (_PyBytes_Resize(&res, requiredsize))
6662 goto onError;
6663 str = PyBytes_AS_STRING(res) + respos;
6664 ressize = requiredsize;
6665 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 /* generate replacement */
6667 for (i = collstart; i < collend; ++i) {
6668 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 break;
6672 default:
6673 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 encoding, reason, unicode, &exc,
6675 collstart, collend, &newpos);
6676 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006677 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006679 if (PyBytes_Check(repunicode)) {
6680 /* Directly copy bytes result to output. */
6681 repsize = PyBytes_Size(repunicode);
6682 if (repsize > 1) {
6683 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006684 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006685 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6686 Py_DECREF(repunicode);
6687 goto onError;
6688 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006689 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006690 ressize += repsize-1;
6691 }
6692 memcpy(str, PyBytes_AsString(repunicode), repsize);
6693 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006694 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006696 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 /* need more space? (at least enough for what we
6699 have+the replacement+the rest of the string, so
6700 we won't have to check space for encodable characters) */
6701 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 repsize = PyUnicode_GET_LENGTH(repunicode);
6703 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 if (requiredsize > ressize) {
6705 if (requiredsize<2*ressize)
6706 requiredsize = 2*ressize;
6707 if (_PyBytes_Resize(&res, requiredsize)) {
6708 Py_DECREF(repunicode);
6709 goto onError;
6710 }
6711 str = PyBytes_AS_STRING(res) + respos;
6712 ressize = requiredsize;
6713 }
6714 /* check if there is anything unencodable in the replacement
6715 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 for (i = 0; repsize-->0; ++i, ++str) {
6717 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006719 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 Py_DECREF(repunicode);
6722 goto onError;
6723 }
6724 *str = (char)c;
6725 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006727 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006729 }
6730 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006731 /* Resize if we allocated to much */
6732 size = str - PyBytes_AS_STRING(res);
6733 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006734 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006735 if (_PyBytes_Resize(&res, size) < 0)
6736 goto onError;
6737 }
6738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 Py_XDECREF(errorHandler);
6740 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006741 return res;
6742
6743 onError:
6744 Py_XDECREF(res);
6745 Py_XDECREF(errorHandler);
6746 Py_XDECREF(exc);
6747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748}
6749
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751PyObject *
6752PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006753 Py_ssize_t size,
6754 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 PyObject *result;
6757 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6758 if (unicode == NULL)
6759 return NULL;
6760 result = unicode_encode_ucs1(unicode, errors, 256);
6761 Py_DECREF(unicode);
6762 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763}
6764
Alexander Belopolsky40018472011-02-26 01:02:56 +00006765PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
6768 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 PyErr_BadArgument();
6770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006772 if (PyUnicode_READY(unicode) == -1)
6773 return NULL;
6774 /* Fast path: if it is a one-byte string, construct
6775 bytes object directly. */
6776 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6777 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6778 PyUnicode_GET_LENGTH(unicode));
6779 /* Non-Latin-1 characters present. Defer to above function to
6780 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006782}
6783
6784PyObject*
6785PyUnicode_AsLatin1String(PyObject *unicode)
6786{
6787 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
6790/* --- 7-bit ASCII Codec -------------------------------------------------- */
6791
Alexander Belopolsky40018472011-02-26 01:02:56 +00006792PyObject *
6793PyUnicode_DecodeASCII(const char *s,
6794 Py_ssize_t size,
6795 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006797 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006798 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006799 int kind;
6800 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006801 Py_ssize_t startinpos;
6802 Py_ssize_t endinpos;
6803 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006805 int has_error;
6806 const unsigned char *p = (const unsigned char *)s;
6807 const unsigned char *end = p + size;
6808 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 PyObject *errorHandler = NULL;
6810 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006811
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006812 if (size == 0) {
6813 Py_INCREF(unicode_empty);
6814 return unicode_empty;
6815 }
6816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006818 if (size == 1 && (unsigned char)s[0] < 128)
6819 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006820
Victor Stinner702c7342011-10-05 13:50:52 +02006821 has_error = 0;
6822 while (p < end && !has_error) {
6823 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6824 an explanation. */
6825 if (!((size_t) p & LONG_PTR_MASK)) {
6826 /* Help register allocation */
6827 register const unsigned char *_p = p;
6828 while (_p < aligned_end) {
6829 unsigned long value = *(unsigned long *) _p;
6830 if (value & ASCII_CHAR_MASK) {
6831 has_error = 1;
6832 break;
6833 }
6834 _p += SIZEOF_LONG;
6835 }
6836 if (_p == end)
6837 break;
6838 if (has_error)
6839 break;
6840 p = _p;
6841 }
6842 if (*p & 0x80) {
6843 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006844 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006845 }
6846 else {
6847 ++p;
6848 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006849 }
Victor Stinner702c7342011-10-05 13:50:52 +02006850 if (!has_error)
6851 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006853 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006857 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006858 kind = PyUnicode_KIND(v);
6859 data = PyUnicode_DATA(v);
6860 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 e = s + size;
6862 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 register unsigned char c = (unsigned char)*s;
6864 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006865 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 ++s;
6867 }
6868 else {
6869 startinpos = s-starts;
6870 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 if (unicode_decode_call_errorhandler(
6872 errors, &errorHandler,
6873 "ascii", "ordinal not in range(128)",
6874 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006875 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006877 kind = PyUnicode_KIND(v);
6878 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006881 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883 Py_XDECREF(errorHandler);
6884 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006885 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006886 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006887
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 Py_XDECREF(errorHandler);
6891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 return NULL;
6893}
6894
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006895/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006896PyObject *
6897PyUnicode_EncodeASCII(const Py_UNICODE *p,
6898 Py_ssize_t size,
6899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901 PyObject *result;
6902 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6903 if (unicode == NULL)
6904 return NULL;
6905 result = unicode_encode_ucs1(unicode, errors, 128);
6906 Py_DECREF(unicode);
6907 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006911_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
6913 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 PyErr_BadArgument();
6915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006917 if (PyUnicode_READY(unicode) == -1)
6918 return NULL;
6919 /* Fast path: if it is an ASCII-only string, construct bytes object
6920 directly. Else defer to above function to raise the exception. */
6921 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6922 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6923 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006925}
6926
6927PyObject *
6928PyUnicode_AsASCIIString(PyObject *unicode)
6929{
6930 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
Victor Stinner99b95382011-07-04 14:23:54 +02006933#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006934
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006935/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006936
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006937#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938#define NEED_RETRY
6939#endif
6940
Victor Stinner3a50e702011-10-18 21:21:00 +02006941#ifndef WC_ERR_INVALID_CHARS
6942# define WC_ERR_INVALID_CHARS 0x0080
6943#endif
6944
6945static char*
6946code_page_name(UINT code_page, PyObject **obj)
6947{
6948 *obj = NULL;
6949 if (code_page == CP_ACP)
6950 return "mbcs";
6951 if (code_page == CP_UTF7)
6952 return "CP_UTF7";
6953 if (code_page == CP_UTF8)
6954 return "CP_UTF8";
6955
6956 *obj = PyBytes_FromFormat("cp%u", code_page);
6957 if (*obj == NULL)
6958 return NULL;
6959 return PyBytes_AS_STRING(*obj);
6960}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961
Alexander Belopolsky40018472011-02-26 01:02:56 +00006962static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006963is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964{
6965 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006966 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006967
Victor Stinner3a50e702011-10-18 21:21:00 +02006968 if (!IsDBCSLeadByteEx(code_page, *curr))
6969 return 0;
6970
6971 prev = CharPrevExA(code_page, s, curr, 0);
6972 if (prev == curr)
6973 return 1;
6974 /* FIXME: This code is limited to "true" double-byte encodings,
6975 as it assumes an incomplete character consists of a single
6976 byte. */
6977 if (curr - prev == 2)
6978 return 1;
6979 if (!IsDBCSLeadByteEx(code_page, *prev))
6980 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981 return 0;
6982}
6983
Victor Stinner3a50e702011-10-18 21:21:00 +02006984static DWORD
6985decode_code_page_flags(UINT code_page)
6986{
6987 if (code_page == CP_UTF7) {
6988 /* The CP_UTF7 decoder only supports flags=0 */
6989 return 0;
6990 }
6991 else
6992 return MB_ERR_INVALID_CHARS;
6993}
6994
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 * Decode a byte string from a Windows code page into unicode object in strict
6997 * mode.
6998 *
6999 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7000 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007002static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007003decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007004 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 const char *in,
7006 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007{
Victor Stinner3a50e702011-10-18 21:21:00 +02007008 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007009 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007010 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011
7012 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 assert(insize > 0);
7014 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7015 if (outsize <= 0)
7016 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017
7018 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007020 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007021 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 if (*v == NULL)
7023 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007025 }
7026 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007029 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007031 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032 }
7033
7034 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007035 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7036 if (outsize <= 0)
7037 goto error;
7038 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007039
Victor Stinner3a50e702011-10-18 21:21:00 +02007040error:
7041 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7042 return -2;
7043 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007044 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045}
7046
Victor Stinner3a50e702011-10-18 21:21:00 +02007047/*
7048 * Decode a byte string from a code page into unicode object with an error
7049 * handler.
7050 *
7051 * Returns consumed size if succeed, or raise a WindowsError or
7052 * UnicodeDecodeError exception and returns -1 on error.
7053 */
7054static int
7055decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007056 PyObject **v,
7057 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 const char *errors)
7059{
7060 const char *startin = in;
7061 const char *endin = in + size;
7062 const DWORD flags = decode_code_page_flags(code_page);
7063 /* Ideally, we should get reason from FormatMessage. This is the Windows
7064 2000 English version of the message. */
7065 const char *reason = "No mapping for the Unicode character exists "
7066 "in the target code page.";
7067 /* each step cannot decode more than 1 character, but a character can be
7068 represented as a surrogate pair */
7069 wchar_t buffer[2], *startout, *out;
7070 int insize, outsize;
7071 PyObject *errorHandler = NULL;
7072 PyObject *exc = NULL;
7073 PyObject *encoding_obj = NULL;
7074 char *encoding;
7075 DWORD err;
7076 int ret = -1;
7077
7078 assert(size > 0);
7079
7080 encoding = code_page_name(code_page, &encoding_obj);
7081 if (encoding == NULL)
7082 return -1;
7083
7084 if (errors == NULL || strcmp(errors, "strict") == 0) {
7085 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7086 UnicodeDecodeError. */
7087 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7088 if (exc != NULL) {
7089 PyCodec_StrictErrors(exc);
7090 Py_CLEAR(exc);
7091 }
7092 goto error;
7093 }
7094
7095 if (*v == NULL) {
7096 /* Create unicode object */
7097 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7098 PyErr_NoMemory();
7099 goto error;
7100 }
Victor Stinnerab595942011-12-17 04:59:06 +01007101 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007102 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 if (*v == NULL)
7104 goto error;
7105 startout = PyUnicode_AS_UNICODE(*v);
7106 }
7107 else {
7108 /* Extend unicode object */
7109 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7110 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7111 PyErr_NoMemory();
7112 goto error;
7113 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007114 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 goto error;
7116 startout = PyUnicode_AS_UNICODE(*v) + n;
7117 }
7118
7119 /* Decode the byte string character per character */
7120 out = startout;
7121 while (in < endin)
7122 {
7123 /* Decode a character */
7124 insize = 1;
7125 do
7126 {
7127 outsize = MultiByteToWideChar(code_page, flags,
7128 in, insize,
7129 buffer, Py_ARRAY_LENGTH(buffer));
7130 if (outsize > 0)
7131 break;
7132 err = GetLastError();
7133 if (err != ERROR_NO_UNICODE_TRANSLATION
7134 && err != ERROR_INSUFFICIENT_BUFFER)
7135 {
7136 PyErr_SetFromWindowsErr(0);
7137 goto error;
7138 }
7139 insize++;
7140 }
7141 /* 4=maximum length of a UTF-8 sequence */
7142 while (insize <= 4 && (in + insize) <= endin);
7143
7144 if (outsize <= 0) {
7145 Py_ssize_t startinpos, endinpos, outpos;
7146
7147 startinpos = in - startin;
7148 endinpos = startinpos + 1;
7149 outpos = out - PyUnicode_AS_UNICODE(*v);
7150 if (unicode_decode_call_errorhandler(
7151 errors, &errorHandler,
7152 encoding, reason,
7153 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007154 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 {
7156 goto error;
7157 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007158 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 }
7160 else {
7161 in += insize;
7162 memcpy(out, buffer, outsize * sizeof(wchar_t));
7163 out += outsize;
7164 }
7165 }
7166
7167 /* write a NUL character at the end */
7168 *out = 0;
7169
7170 /* Extend unicode object */
7171 outsize = out - startout;
7172 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007173 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007175 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176
7177error:
7178 Py_XDECREF(encoding_obj);
7179 Py_XDECREF(errorHandler);
7180 Py_XDECREF(exc);
7181 return ret;
7182}
7183
Victor Stinner3a50e702011-10-18 21:21:00 +02007184static PyObject *
7185decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 const char *s, Py_ssize_t size,
7187 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188{
Victor Stinner76a31a62011-11-04 00:05:13 +01007189 PyObject *v = NULL;
7190 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 if (code_page < 0) {
7193 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7194 return NULL;
7195 }
7196
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 do
7201 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 if (size > INT_MAX) {
7204 chunk_size = INT_MAX;
7205 final = 0;
7206 done = 0;
7207 }
7208 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 {
7211 chunk_size = (int)size;
7212 final = (consumed == NULL);
7213 done = 1;
7214 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 /* Skip trailing lead-byte unless 'final' is set */
7217 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7218 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 if (chunk_size == 0 && done) {
7221 if (v != NULL)
7222 break;
7223 Py_INCREF(unicode_empty);
7224 return unicode_empty;
7225 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226
Victor Stinner76a31a62011-11-04 00:05:13 +01007227
7228 converted = decode_code_page_strict(code_page, &v,
7229 s, chunk_size);
7230 if (converted == -2)
7231 converted = decode_code_page_errors(code_page, &v,
7232 s, chunk_size,
7233 errors);
7234 assert(converted != 0);
7235
7236 if (converted < 0) {
7237 Py_XDECREF(v);
7238 return NULL;
7239 }
7240
7241 if (consumed)
7242 *consumed += converted;
7243
7244 s += converted;
7245 size -= converted;
7246 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007247
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007248 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007249}
7250
Alexander Belopolsky40018472011-02-26 01:02:56 +00007251PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007252PyUnicode_DecodeCodePageStateful(int code_page,
7253 const char *s,
7254 Py_ssize_t size,
7255 const char *errors,
7256 Py_ssize_t *consumed)
7257{
7258 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7259}
7260
7261PyObject *
7262PyUnicode_DecodeMBCSStateful(const char *s,
7263 Py_ssize_t size,
7264 const char *errors,
7265 Py_ssize_t *consumed)
7266{
7267 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7268}
7269
7270PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007271PyUnicode_DecodeMBCS(const char *s,
7272 Py_ssize_t size,
7273 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007274{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7276}
7277
Victor Stinner3a50e702011-10-18 21:21:00 +02007278static DWORD
7279encode_code_page_flags(UINT code_page, const char *errors)
7280{
7281 if (code_page == CP_UTF8) {
7282 if (winver.dwMajorVersion >= 6)
7283 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7284 and later */
7285 return WC_ERR_INVALID_CHARS;
7286 else
7287 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7288 return 0;
7289 }
7290 else if (code_page == CP_UTF7) {
7291 /* CP_UTF7 only supports flags=0 */
7292 return 0;
7293 }
7294 else {
7295 if (errors != NULL && strcmp(errors, "replace") == 0)
7296 return 0;
7297 else
7298 return WC_NO_BEST_FIT_CHARS;
7299 }
7300}
7301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 * Encode a Unicode string to a Windows code page into a byte string in strict
7304 * mode.
7305 *
7306 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7307 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007309static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007310encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007311 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313{
Victor Stinner554f3f02010-06-16 23:33:54 +00007314 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 BOOL *pusedDefaultChar = &usedDefaultChar;
7316 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007317 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007318 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007319 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 const DWORD flags = encode_code_page_flags(code_page, NULL);
7321 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007322 /* Create a substring so that we can get the UTF-16 representation
7323 of just the slice under consideration. */
7324 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007327
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007329 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007331 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007332
Victor Stinner2fc507f2011-11-04 20:06:39 +01007333 substring = PyUnicode_Substring(unicode, offset, offset+len);
7334 if (substring == NULL)
7335 return -1;
7336 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7337 if (p == NULL) {
7338 Py_DECREF(substring);
7339 return -1;
7340 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007341
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007342 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 outsize = WideCharToMultiByte(code_page, flags,
7344 p, size,
7345 NULL, 0,
7346 NULL, pusedDefaultChar);
7347 if (outsize <= 0)
7348 goto error;
7349 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007350 if (pusedDefaultChar && *pusedDefaultChar) {
7351 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007353 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007354
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007358 if (*outbytes == NULL) {
7359 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007361 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363 }
7364 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 const Py_ssize_t n = PyBytes_Size(*outbytes);
7367 if (outsize > PY_SSIZE_T_MAX - n) {
7368 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007369 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007372 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7373 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377 }
7378
7379 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 outsize = WideCharToMultiByte(code_page, flags,
7381 p, size,
7382 out, outsize,
7383 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007384 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 if (outsize <= 0)
7386 goto error;
7387 if (pusedDefaultChar && *pusedDefaultChar)
7388 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007390
Victor Stinner3a50e702011-10-18 21:21:00 +02007391error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007392 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7394 return -2;
7395 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007396 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007397}
7398
Victor Stinner3a50e702011-10-18 21:21:00 +02007399/*
7400 * Encode a Unicode string to a Windows code page into a byte string using a
7401 * error handler.
7402 *
7403 * Returns consumed characters if succeed, or raise a WindowsError and returns
7404 * -1 on other error.
7405 */
7406static int
7407encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007408 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007410{
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007412 Py_ssize_t pos = unicode_offset;
7413 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 /* Ideally, we should get reason from FormatMessage. This is the Windows
7415 2000 English version of the message. */
7416 const char *reason = "invalid character";
7417 /* 4=maximum length of a UTF-8 sequence */
7418 char buffer[4];
7419 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7420 Py_ssize_t outsize;
7421 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 PyObject *errorHandler = NULL;
7423 PyObject *exc = NULL;
7424 PyObject *encoding_obj = NULL;
7425 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007426 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 PyObject *rep;
7428 int ret = -1;
7429
7430 assert(insize > 0);
7431
7432 encoding = code_page_name(code_page, &encoding_obj);
7433 if (encoding == NULL)
7434 return -1;
7435
7436 if (errors == NULL || strcmp(errors, "strict") == 0) {
7437 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7438 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007439 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 if (exc != NULL) {
7441 PyCodec_StrictErrors(exc);
7442 Py_DECREF(exc);
7443 }
7444 Py_XDECREF(encoding_obj);
7445 return -1;
7446 }
7447
7448 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7449 pusedDefaultChar = &usedDefaultChar;
7450 else
7451 pusedDefaultChar = NULL;
7452
7453 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7454 PyErr_NoMemory();
7455 goto error;
7456 }
7457 outsize = insize * Py_ARRAY_LENGTH(buffer);
7458
7459 if (*outbytes == NULL) {
7460 /* Create string object */
7461 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7462 if (*outbytes == NULL)
7463 goto error;
7464 out = PyBytes_AS_STRING(*outbytes);
7465 }
7466 else {
7467 /* Extend string object */
7468 Py_ssize_t n = PyBytes_Size(*outbytes);
7469 if (n > PY_SSIZE_T_MAX - outsize) {
7470 PyErr_NoMemory();
7471 goto error;
7472 }
7473 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7474 goto error;
7475 out = PyBytes_AS_STRING(*outbytes) + n;
7476 }
7477
7478 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007479 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7482 wchar_t chars[2];
7483 int charsize;
7484 if (ch < 0x10000) {
7485 chars[0] = (wchar_t)ch;
7486 charsize = 1;
7487 }
7488 else {
7489 ch -= 0x10000;
7490 chars[0] = 0xd800 + (ch >> 10);
7491 chars[1] = 0xdc00 + (ch & 0x3ff);
7492 charsize = 2;
7493 }
7494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 buffer, Py_ARRAY_LENGTH(buffer),
7498 NULL, pusedDefaultChar);
7499 if (outsize > 0) {
7500 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7501 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007502 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 memcpy(out, buffer, outsize);
7504 out += outsize;
7505 continue;
7506 }
7507 }
7508 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7509 PyErr_SetFromWindowsErr(0);
7510 goto error;
7511 }
7512
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 rep = unicode_encode_call_errorhandler(
7514 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007515 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 if (rep == NULL)
7518 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007519 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520
7521 if (PyBytes_Check(rep)) {
7522 outsize = PyBytes_GET_SIZE(rep);
7523 if (outsize != 1) {
7524 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7525 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7526 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7527 Py_DECREF(rep);
7528 goto error;
7529 }
7530 out = PyBytes_AS_STRING(*outbytes) + offset;
7531 }
7532 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7533 out += outsize;
7534 }
7535 else {
7536 Py_ssize_t i;
7537 enum PyUnicode_Kind kind;
7538 void *data;
7539
Benjamin Petersonbac79492012-01-14 13:34:47 -05007540 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 Py_DECREF(rep);
7542 goto error;
7543 }
7544
7545 outsize = PyUnicode_GET_LENGTH(rep);
7546 if (outsize != 1) {
7547 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7548 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7549 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7550 Py_DECREF(rep);
7551 goto error;
7552 }
7553 out = PyBytes_AS_STRING(*outbytes) + offset;
7554 }
7555 kind = PyUnicode_KIND(rep);
7556 data = PyUnicode_DATA(rep);
7557 for (i=0; i < outsize; i++) {
7558 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7559 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007560 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007561 encoding, unicode,
7562 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 "unable to encode error handler result to ASCII");
7564 Py_DECREF(rep);
7565 goto error;
7566 }
7567 *out = (unsigned char)ch;
7568 out++;
7569 }
7570 }
7571 Py_DECREF(rep);
7572 }
7573 /* write a NUL byte */
7574 *out = 0;
7575 outsize = out - PyBytes_AS_STRING(*outbytes);
7576 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7577 if (_PyBytes_Resize(outbytes, outsize) < 0)
7578 goto error;
7579 ret = 0;
7580
7581error:
7582 Py_XDECREF(encoding_obj);
7583 Py_XDECREF(errorHandler);
7584 Py_XDECREF(exc);
7585 return ret;
7586}
7587
Victor Stinner3a50e702011-10-18 21:21:00 +02007588static PyObject *
7589encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007590 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 const char *errors)
7592{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007593 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007595 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007596 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007597
Benjamin Petersonbac79492012-01-14 13:34:47 -05007598 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007599 return NULL;
7600 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007601
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 if (code_page < 0) {
7603 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7604 return NULL;
7605 }
7606
Martin v. Löwis3d325192011-11-04 18:23:06 +01007607 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007608 return PyBytes_FromStringAndSize(NULL, 0);
7609
Victor Stinner7581cef2011-11-03 22:32:33 +01007610 offset = 0;
7611 do
7612 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007613#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007614 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 chunks. */
7616 if (len > INT_MAX/2) {
7617 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007618 done = 0;
7619 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007620 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007622 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007624 done = 1;
7625 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626
Victor Stinner76a31a62011-11-04 00:05:13 +01007627 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007629 errors);
7630 if (ret == -2)
7631 ret = encode_code_page_errors(code_page, &outbytes,
7632 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007633 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007634 if (ret < 0) {
7635 Py_XDECREF(outbytes);
7636 return NULL;
7637 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007638
Victor Stinner7581cef2011-11-03 22:32:33 +01007639 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007640 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007641 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 return outbytes;
7644}
7645
7646PyObject *
7647PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7648 Py_ssize_t size,
7649 const char *errors)
7650{
Victor Stinner7581cef2011-11-03 22:32:33 +01007651 PyObject *unicode, *res;
7652 unicode = PyUnicode_FromUnicode(p, size);
7653 if (unicode == NULL)
7654 return NULL;
7655 res = encode_code_page(CP_ACP, unicode, errors);
7656 Py_DECREF(unicode);
7657 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007658}
7659
7660PyObject *
7661PyUnicode_EncodeCodePage(int code_page,
7662 PyObject *unicode,
7663 const char *errors)
7664{
Victor Stinner7581cef2011-11-03 22:32:33 +01007665 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007666}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007667
Alexander Belopolsky40018472011-02-26 01:02:56 +00007668PyObject *
7669PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007670{
7671 if (!PyUnicode_Check(unicode)) {
7672 PyErr_BadArgument();
7673 return NULL;
7674 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007675 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007676}
7677
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007678#undef NEED_RETRY
7679
Victor Stinner99b95382011-07-04 14:23:54 +02007680#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007681
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682/* --- Character Mapping Codec -------------------------------------------- */
7683
Alexander Belopolsky40018472011-02-26 01:02:56 +00007684PyObject *
7685PyUnicode_DecodeCharmap(const char *s,
7686 Py_ssize_t size,
7687 PyObject *mapping,
7688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007690 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007691 Py_ssize_t startinpos;
7692 Py_ssize_t endinpos;
7693 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007695 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007696 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 PyObject *errorHandler = NULL;
7698 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007699
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 /* Default to Latin-1 */
7701 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007704 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007708 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007709 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007711 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007712 Py_ssize_t maplen;
7713 enum PyUnicode_Kind kind;
7714 void *data;
7715 Py_UCS4 x;
7716
Benjamin Petersonbac79492012-01-14 13:34:47 -05007717 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007718 return NULL;
7719
7720 maplen = PyUnicode_GET_LENGTH(mapping);
7721 data = PyUnicode_DATA(mapping);
7722 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 while (s < e) {
7724 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007727 x = PyUnicode_READ(kind, data, ch);
7728 else
7729 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007731 if (x == 0xfffe)
7732 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 startinpos = s-starts;
7735 endinpos = startinpos+1;
7736 if (unicode_decode_call_errorhandler(
7737 errors, &errorHandler,
7738 "charmap", "character maps to <undefined>",
7739 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007740 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 goto onError;
7742 }
7743 continue;
7744 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007745
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007746 if (unicode_putchar(&v, &outpos, x) < 0)
7747 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007750 }
7751 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 while (s < e) {
7753 unsigned char ch = *s;
7754 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007755
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7757 w = PyLong_FromLong((long)ch);
7758 if (w == NULL)
7759 goto onError;
7760 x = PyObject_GetItem(mapping, w);
7761 Py_DECREF(w);
7762 if (x == NULL) {
7763 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7764 /* No mapping found means: mapping is undefined. */
7765 PyErr_Clear();
7766 x = Py_None;
7767 Py_INCREF(x);
7768 } else
7769 goto onError;
7770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 /* Apply mapping */
7773 if (PyLong_Check(x)) {
7774 long value = PyLong_AS_LONG(x);
7775 if (value < 0 || value > 65535) {
7776 PyErr_SetString(PyExc_TypeError,
7777 "character mapping must be in range(65536)");
7778 Py_DECREF(x);
7779 goto onError;
7780 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007781 if (unicode_putchar(&v, &outpos, value) < 0)
7782 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 }
7784 else if (x == Py_None) {
7785 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 startinpos = s-starts;
7787 endinpos = startinpos+1;
7788 if (unicode_decode_call_errorhandler(
7789 errors, &errorHandler,
7790 "charmap", "character maps to <undefined>",
7791 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007792 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 Py_DECREF(x);
7794 goto onError;
7795 }
7796 Py_DECREF(x);
7797 continue;
7798 }
7799 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007800 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801
Benjamin Petersonbac79492012-01-14 13:34:47 -05007802 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007803 goto onError;
7804 targetsize = PyUnicode_GET_LENGTH(x);
7805
7806 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007808 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007809 PyUnicode_READ_CHAR(x, 0)) < 0)
7810 goto onError;
7811 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 else if (targetsize > 1) {
7813 /* 1-n mapping */
7814 if (targetsize > extrachars) {
7815 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 Py_ssize_t needed = (targetsize - extrachars) + \
7817 (targetsize << 2);
7818 extrachars += needed;
7819 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007820 if (unicode_resize(&v,
7821 PyUnicode_GET_LENGTH(v) + needed) < 0)
7822 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 Py_DECREF(x);
7824 goto onError;
7825 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007827 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7828 goto onError;
7829 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7830 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 extrachars -= targetsize;
7832 }
7833 /* 1-0 mapping: skip the character */
7834 }
7835 else {
7836 /* wrong return value */
7837 PyErr_SetString(PyExc_TypeError,
7838 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 Py_DECREF(x);
7840 goto onError;
7841 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 Py_DECREF(x);
7843 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007846 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007848 Py_XDECREF(errorHandler);
7849 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007850 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007851
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007853 Py_XDECREF(errorHandler);
7854 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 Py_XDECREF(v);
7856 return NULL;
7857}
7858
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859/* Charmap encoding: the lookup table */
7860
Alexander Belopolsky40018472011-02-26 01:02:56 +00007861struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 PyObject_HEAD
7863 unsigned char level1[32];
7864 int count2, count3;
7865 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866};
7867
7868static PyObject*
7869encoding_map_size(PyObject *obj, PyObject* args)
7870{
7871 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874}
7875
7876static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 PyDoc_STR("Return the size (in bytes) of this object") },
7879 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880};
7881
7882static void
7883encoding_map_dealloc(PyObject* o)
7884{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007885 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886}
7887
7888static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 "EncodingMap", /*tp_name*/
7891 sizeof(struct encoding_map), /*tp_basicsize*/
7892 0, /*tp_itemsize*/
7893 /* methods */
7894 encoding_map_dealloc, /*tp_dealloc*/
7895 0, /*tp_print*/
7896 0, /*tp_getattr*/
7897 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007898 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 0, /*tp_repr*/
7900 0, /*tp_as_number*/
7901 0, /*tp_as_sequence*/
7902 0, /*tp_as_mapping*/
7903 0, /*tp_hash*/
7904 0, /*tp_call*/
7905 0, /*tp_str*/
7906 0, /*tp_getattro*/
7907 0, /*tp_setattro*/
7908 0, /*tp_as_buffer*/
7909 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7910 0, /*tp_doc*/
7911 0, /*tp_traverse*/
7912 0, /*tp_clear*/
7913 0, /*tp_richcompare*/
7914 0, /*tp_weaklistoffset*/
7915 0, /*tp_iter*/
7916 0, /*tp_iternext*/
7917 encoding_map_methods, /*tp_methods*/
7918 0, /*tp_members*/
7919 0, /*tp_getset*/
7920 0, /*tp_base*/
7921 0, /*tp_dict*/
7922 0, /*tp_descr_get*/
7923 0, /*tp_descr_set*/
7924 0, /*tp_dictoffset*/
7925 0, /*tp_init*/
7926 0, /*tp_alloc*/
7927 0, /*tp_new*/
7928 0, /*tp_free*/
7929 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930};
7931
7932PyObject*
7933PyUnicode_BuildEncodingMap(PyObject* string)
7934{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007935 PyObject *result;
7936 struct encoding_map *mresult;
7937 int i;
7938 int need_dict = 0;
7939 unsigned char level1[32];
7940 unsigned char level2[512];
7941 unsigned char *mlevel1, *mlevel2, *mlevel3;
7942 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 int kind;
7944 void *data;
7945 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948 PyErr_BadArgument();
7949 return NULL;
7950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 kind = PyUnicode_KIND(string);
7952 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 memset(level1, 0xFF, sizeof level1);
7954 memset(level2, 0xFF, sizeof level2);
7955
7956 /* If there isn't a one-to-one mapping of NULL to \0,
7957 or if there are non-BMP characters, we need to use
7958 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007959 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960 need_dict = 1;
7961 for (i = 1; i < 256; i++) {
7962 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 ch = PyUnicode_READ(kind, data, i);
7964 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965 need_dict = 1;
7966 break;
7967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007969 /* unmapped character */
7970 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 l1 = ch >> 11;
7972 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973 if (level1[l1] == 0xFF)
7974 level1[l1] = count2++;
7975 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977 }
7978
7979 if (count2 >= 0xFF || count3 >= 0xFF)
7980 need_dict = 1;
7981
7982 if (need_dict) {
7983 PyObject *result = PyDict_New();
7984 PyObject *key, *value;
7985 if (!result)
7986 return NULL;
7987 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007989 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990 if (!key || !value)
7991 goto failed1;
7992 if (PyDict_SetItem(result, key, value) == -1)
7993 goto failed1;
7994 Py_DECREF(key);
7995 Py_DECREF(value);
7996 }
7997 return result;
7998 failed1:
7999 Py_XDECREF(key);
8000 Py_XDECREF(value);
8001 Py_DECREF(result);
8002 return NULL;
8003 }
8004
8005 /* Create a three-level trie */
8006 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8007 16*count2 + 128*count3 - 1);
8008 if (!result)
8009 return PyErr_NoMemory();
8010 PyObject_Init(result, &EncodingMapType);
8011 mresult = (struct encoding_map*)result;
8012 mresult->count2 = count2;
8013 mresult->count3 = count3;
8014 mlevel1 = mresult->level1;
8015 mlevel2 = mresult->level23;
8016 mlevel3 = mresult->level23 + 16*count2;
8017 memcpy(mlevel1, level1, 32);
8018 memset(mlevel2, 0xFF, 16*count2);
8019 memset(mlevel3, 0, 128*count3);
8020 count3 = 0;
8021 for (i = 1; i < 256; i++) {
8022 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008023 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024 /* unmapped character */
8025 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 o1 = PyUnicode_READ(kind, data, i)>>11;
8027 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 i2 = 16*mlevel1[o1] + o2;
8029 if (mlevel2[i2] == 0xFF)
8030 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 i3 = 128*mlevel2[i2] + o3;
8033 mlevel3[i3] = i;
8034 }
8035 return result;
8036}
8037
8038static int
Victor Stinner22168992011-11-20 17:09:18 +01008039encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040{
8041 struct encoding_map *map = (struct encoding_map*)mapping;
8042 int l1 = c>>11;
8043 int l2 = (c>>7) & 0xF;
8044 int l3 = c & 0x7F;
8045 int i;
8046
Victor Stinner22168992011-11-20 17:09:18 +01008047 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 if (c == 0)
8050 return 0;
8051 /* level 1*/
8052 i = map->level1[l1];
8053 if (i == 0xFF) {
8054 return -1;
8055 }
8056 /* level 2*/
8057 i = map->level23[16*i+l2];
8058 if (i == 0xFF) {
8059 return -1;
8060 }
8061 /* level 3 */
8062 i = map->level23[16*map->count2 + 128*i + l3];
8063 if (i == 0) {
8064 return -1;
8065 }
8066 return i;
8067}
8068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069/* Lookup the character ch in the mapping. If the character
8070 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008071 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008073charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074{
Christian Heimes217cfd12007-12-02 14:31:20 +00008075 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 PyObject *x;
8077
8078 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 x = PyObject_GetItem(mapping, w);
8081 Py_DECREF(w);
8082 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8084 /* No mapping found means: mapping is undefined. */
8085 PyErr_Clear();
8086 x = Py_None;
8087 Py_INCREF(x);
8088 return x;
8089 } else
8090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008092 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008094 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 long value = PyLong_AS_LONG(x);
8096 if (value < 0 || value > 255) {
8097 PyErr_SetString(PyExc_TypeError,
8098 "character mapping must be in range(256)");
8099 Py_DECREF(x);
8100 return NULL;
8101 }
8102 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008104 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 /* wrong return value */
8108 PyErr_Format(PyExc_TypeError,
8109 "character mapping must return integer, bytes or None, not %.400s",
8110 x->ob_type->tp_name);
8111 Py_DECREF(x);
8112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 }
8114}
8115
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008117charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8120 /* exponentially overallocate to minimize reallocations */
8121 if (requiredsize < 2*outsize)
8122 requiredsize = 2*outsize;
8123 if (_PyBytes_Resize(outobj, requiredsize))
8124 return -1;
8125 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126}
8127
/* Outcome of encoding one character through a charmap:
     enc_SUCCESS    the character was encoded into the output
     enc_FAILED     the mapping is undefined for the character
     enc_EXCEPTION  an error occurred */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008132 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 space is available. Return a new reference to the object that
8134 was put in the output buffer, or Py_None, if the mapping was undefined
8135 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008136 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008137static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008138charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008139 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 PyObject *rep;
8142 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008143 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144
Christian Heimes90aa7642007-12-19 02:45:37 +00008145 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 if (res == -1)
8149 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 if (outsize<requiredsize)
8151 if (charmapencode_resize(outobj, outpos, requiredsize))
8152 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008153 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 outstart[(*outpos)++] = (char)res;
8155 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 }
8157
8158 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 Py_DECREF(rep);
8163 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 if (PyLong_Check(rep)) {
8166 Py_ssize_t requiredsize = *outpos+1;
8167 if (outsize<requiredsize)
8168 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8169 Py_DECREF(rep);
8170 return enc_EXCEPTION;
8171 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008172 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 else {
8176 const char *repchars = PyBytes_AS_STRING(rep);
8177 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8178 Py_ssize_t requiredsize = *outpos+repsize;
8179 if (outsize<requiredsize)
8180 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8181 Py_DECREF(rep);
8182 return enc_EXCEPTION;
8183 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008184 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 memcpy(outstart + *outpos, repchars, repsize);
8186 *outpos += repsize;
8187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008188 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 Py_DECREF(rep);
8190 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191}
8192
8193/* handle an error in PyUnicode_EncodeCharmap
8194 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195static int
8196charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008197 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008199 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008200 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201{
8202 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008203 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008204 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008205 enum PyUnicode_Kind kind;
8206 void *data;
8207 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008209 Py_ssize_t collstartpos = *inpos;
8210 Py_ssize_t collendpos = *inpos+1;
8211 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 char *encoding = "charmap";
8213 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008215 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008216 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217
Benjamin Petersonbac79492012-01-14 13:34:47 -05008218 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008219 return -1;
8220 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 /* find all unencodable characters */
8222 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008224 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008225 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008226 val = encoding_map_lookup(ch, mapping);
8227 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 break;
8229 ++collendpos;
8230 continue;
8231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008233 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8234 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 if (rep==NULL)
8236 return -1;
8237 else if (rep!=Py_None) {
8238 Py_DECREF(rep);
8239 break;
8240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 }
8244 /* cache callback name lookup
8245 * (if not done yet, i.e. it's the first error) */
8246 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 if ((errors==NULL) || (!strcmp(errors, "strict")))
8248 *known_errorHandler = 1;
8249 else if (!strcmp(errors, "replace"))
8250 *known_errorHandler = 2;
8251 else if (!strcmp(errors, "ignore"))
8252 *known_errorHandler = 3;
8253 else if (!strcmp(errors, "xmlcharrefreplace"))
8254 *known_errorHandler = 4;
8255 else
8256 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 }
8258 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008260 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 return -1;
8262 case 2: /* replace */
8263 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 x = charmapencode_output('?', mapping, res, respos);
8265 if (x==enc_EXCEPTION) {
8266 return -1;
8267 }
8268 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008269 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return -1;
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
8273 /* fall through */
8274 case 3: /* ignore */
8275 *inpos = collendpos;
8276 break;
8277 case 4: /* xmlcharrefreplace */
8278 /* generate replacement (temporarily (mis)uses p) */
8279 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 char buffer[2+29+1+1];
8281 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 for (cp = buffer; *cp; ++cp) {
8284 x = charmapencode_output(*cp, mapping, res, respos);
8285 if (x==enc_EXCEPTION)
8286 return -1;
8287 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008288 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return -1;
8290 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 }
8292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008293 *inpos = collendpos;
8294 break;
8295 default:
8296 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008301 if (PyBytes_Check(repunicode)) {
8302 /* Directly copy bytes result to output. */
8303 Py_ssize_t outsize = PyBytes_Size(*res);
8304 Py_ssize_t requiredsize;
8305 repsize = PyBytes_Size(repunicode);
8306 requiredsize = *respos + repsize;
8307 if (requiredsize > outsize)
8308 /* Make room for all additional bytes. */
8309 if (charmapencode_resize(res, respos, requiredsize)) {
8310 Py_DECREF(repunicode);
8311 return -1;
8312 }
8313 memcpy(PyBytes_AsString(*res) + *respos,
8314 PyBytes_AsString(repunicode), repsize);
8315 *respos += repsize;
8316 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008317 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008318 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008319 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008321 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008322 Py_DECREF(repunicode);
8323 return -1;
8324 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008325 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008326 data = PyUnicode_DATA(repunicode);
8327 kind = PyUnicode_KIND(repunicode);
8328 for (index = 0; index < repsize; index++) {
8329 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8330 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008332 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return -1;
8334 }
8335 else if (x==enc_FAILED) {
8336 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008337 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 return -1;
8339 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008340 }
8341 *inpos = newpos;
8342 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 }
8344 return 0;
8345}
8346
Alexander Belopolsky40018472011-02-26 01:02:56 +00008347PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008348_PyUnicode_EncodeCharmap(PyObject *unicode,
8349 PyObject *mapping,
8350 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 /* output object */
8353 PyObject *res = NULL;
8354 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008355 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008358 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 PyObject *errorHandler = NULL;
8360 PyObject *exc = NULL;
8361 /* the following variable is used for caching string comparisons
8362 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8363 * 3=ignore, 4=xmlcharrefreplace */
8364 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365
Benjamin Petersonbac79492012-01-14 13:34:47 -05008366 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 return NULL;
8368 size = PyUnicode_GET_LENGTH(unicode);
8369
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 /* Default to Latin-1 */
8371 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 /* allocate enough for a simple encoding without
8375 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008376 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 if (res == NULL)
8378 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008379 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 if (x==enc_EXCEPTION) /* error */
8387 goto onError;
8388 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 &exc,
8391 &known_errorHandler, &errorHandler, errors,
8392 &res, &respos)) {
8393 goto onError;
8394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 else
8397 /* done with this character => adjust input position */
8398 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008402 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008403 if (_PyBytes_Resize(&res, respos) < 0)
8404 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 Py_XDECREF(exc);
8407 Py_XDECREF(errorHandler);
8408 return res;
8409
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 Py_XDECREF(res);
8412 Py_XDECREF(exc);
8413 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 return NULL;
8415}
8416
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417/* Deprecated */
8418PyObject *
8419PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8420 Py_ssize_t size,
8421 PyObject *mapping,
8422 const char *errors)
8423{
8424 PyObject *result;
8425 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8426 if (unicode == NULL)
8427 return NULL;
8428 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8429 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008430 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431}
8432
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433PyObject *
8434PyUnicode_AsCharmapString(PyObject *unicode,
8435 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436{
8437 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 PyErr_BadArgument();
8439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442}
8443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008445static void
8446make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008448 Py_ssize_t startpos, Py_ssize_t endpos,
8449 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 *exceptionObject = _PyUnicodeTranslateError_Create(
8453 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 }
8455 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8457 goto onError;
8458 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8459 goto onError;
8460 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8461 goto onError;
8462 return;
8463 onError:
8464 Py_DECREF(*exceptionObject);
8465 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 }
8467}
8468
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008470static void
8471raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008473 Py_ssize_t startpos, Py_ssize_t endpos,
8474 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475{
8476 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480}
8481
8482/* error handling callback helper:
8483 build arguments, call the callback and check the arguments,
8484 put the result into newpos and return the replacement string, which
8485 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486static PyObject *
8487unicode_translate_call_errorhandler(const char *errors,
8488 PyObject **errorHandler,
8489 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008491 Py_ssize_t startpos, Py_ssize_t endpos,
8492 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008494 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008496 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 PyObject *restuple;
8498 PyObject *resunicode;
8499
8500 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 }
8505
8506 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510
8511 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008516 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 Py_DECREF(restuple);
8518 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
8520 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 &resunicode, &i_newpos)) {
8522 Py_DECREF(restuple);
8523 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008525 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 else
8528 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8531 Py_DECREF(restuple);
8532 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 Py_INCREF(resunicode);
8535 Py_DECREF(restuple);
8536 return resunicode;
8537}
8538
8539/* Lookup the character ch in the mapping and put the result in result,
8540 which must be decrefed by the caller.
8541 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
Christian Heimes217cfd12007-12-02 14:31:20 +00008545 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 PyObject *x;
8547
8548 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 x = PyObject_GetItem(mapping, w);
8551 Py_DECREF(w);
8552 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8554 /* No mapping found means: use 1:1 mapping. */
8555 PyErr_Clear();
8556 *result = NULL;
8557 return 0;
8558 } else
8559 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 }
8561 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 *result = x;
8563 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008565 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 long value = PyLong_AS_LONG(x);
8567 long max = PyUnicode_GetMax();
8568 if (value < 0 || value > max) {
8569 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008570 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 Py_DECREF(x);
8572 return -1;
8573 }
8574 *result = x;
8575 return 0;
8576 }
8577 else if (PyUnicode_Check(x)) {
8578 *result = x;
8579 return 0;
8580 }
8581 else {
8582 /* wrong return value */
8583 PyErr_SetString(PyExc_TypeError,
8584 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008585 Py_DECREF(x);
8586 return -1;
8587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588}
8589/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 if not reallocate and adjust various state variables.
8591 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008592static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008597 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 /* exponentially overallocate to minimize reallocations */
8599 if (requiredsize < 2 * oldsize)
8600 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8602 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605 }
8606 return 0;
8607}
8608/* lookup the character, put the result in the output string and adjust
8609 various state variables. Return a new reference to the object that
8610 was put in the output buffer in *result, or Py_None, if the mapping was
8611 undefined (in which case no character was written).
8612 The called must decref result.
8613 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8616 PyObject *mapping, Py_UCS4 **output,
8617 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008618 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8621 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 }
8627 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008629 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 }
8633 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 Py_ssize_t repsize;
8635 if (PyUnicode_READY(*res) == -1)
8636 return -1;
8637 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (repsize==1) {
8639 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 }
8642 else if (repsize!=0) {
8643 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 Py_ssize_t requiredsize = *opos +
8645 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 Py_ssize_t i;
8648 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 for(i = 0; i < repsize; i++)
8651 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 }
8654 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 return 0;
8657}
8658
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660_PyUnicode_TranslateCharmap(PyObject *input,
8661 PyObject *mapping,
8662 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 /* input object */
8665 char *idata;
8666 Py_ssize_t size, i;
8667 int kind;
8668 /* output buffer */
8669 Py_UCS4 *output = NULL;
8670 Py_ssize_t osize;
8671 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 char *reason = "character maps to <undefined>";
8675 PyObject *errorHandler = NULL;
8676 PyObject *exc = NULL;
8677 /* the following variable is used for caching string comparisons
8678 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8679 * 3=ignore, 4=xmlcharrefreplace */
8680 int known_errorHandler = -1;
8681
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 PyErr_BadArgument();
8684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 if (PyUnicode_READY(input) == -1)
8688 return NULL;
8689 idata = (char*)PyUnicode_DATA(input);
8690 kind = PyUnicode_KIND(input);
8691 size = PyUnicode_GET_LENGTH(input);
8692 i = 0;
8693
8694 if (size == 0) {
8695 Py_INCREF(input);
8696 return input;
8697 }
8698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 /* allocate enough for a simple 1:1 translation without
8700 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 osize = size;
8702 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8703 opos = 0;
8704 if (output == NULL) {
8705 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 /* try to encode it */
8711 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 if (charmaptranslate_output(input, i, mapping,
8713 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 Py_XDECREF(x);
8715 goto onError;
8716 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008717 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 else { /* untranslatable character */
8721 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8722 Py_ssize_t repsize;
8723 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 Py_ssize_t collstart = i;
8727 Py_ssize_t collend = i+1;
8728 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 while (collend < size) {
8732 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 goto onError;
8734 Py_XDECREF(x);
8735 if (x!=Py_None)
8736 break;
8737 ++collend;
8738 }
8739 /* cache callback name lookup
8740 * (if not done yet, i.e. it's the first error) */
8741 if (known_errorHandler==-1) {
8742 if ((errors==NULL) || (!strcmp(errors, "strict")))
8743 known_errorHandler = 1;
8744 else if (!strcmp(errors, "replace"))
8745 known_errorHandler = 2;
8746 else if (!strcmp(errors, "ignore"))
8747 known_errorHandler = 3;
8748 else if (!strcmp(errors, "xmlcharrefreplace"))
8749 known_errorHandler = 4;
8750 else
8751 known_errorHandler = 0;
8752 }
8753 switch (known_errorHandler) {
8754 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 raise_translate_exception(&exc, input, collstart,
8756 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 case 2: /* replace */
8759 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 for (coll = collstart; coll<collend; coll++)
8761 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 /* fall through */
8763 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 break;
8766 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 /* generate replacement (temporarily (mis)uses i) */
8768 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 char buffer[2+29+1+1];
8770 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8772 if (charmaptranslate_makespace(&output, &osize,
8773 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 goto onError;
8775 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 break;
8780 default:
8781 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 reason, input, &exc,
8783 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008784 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008786 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008787 Py_DECREF(repunicode);
8788 goto onError;
8789 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 repsize = PyUnicode_GET_LENGTH(repunicode);
8792 if (charmaptranslate_makespace(&output, &osize,
8793 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 Py_DECREF(repunicode);
8795 goto onError;
8796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 for (uni2 = 0; repsize-->0; ++uni2)
8798 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8799 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008802 }
8803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8805 if (!res)
8806 goto onError;
8807 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808 Py_XDECREF(exc);
8809 Py_XDECREF(errorHandler);
8810 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814 Py_XDECREF(exc);
8815 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 return NULL;
8817}
8818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819/* Deprecated. Use PyUnicode_Translate instead. */
8820PyObject *
8821PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8822 Py_ssize_t size,
8823 PyObject *mapping,
8824 const char *errors)
8825{
8826 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8827 if (!unicode)
8828 return NULL;
8829 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8830}
8831
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832PyObject *
8833PyUnicode_Translate(PyObject *str,
8834 PyObject *mapping,
8835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
8837 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008838
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 str = PyUnicode_FromObject(str);
8840 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 Py_DECREF(str);
8844 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008845
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 Py_XDECREF(str);
8848 return NULL;
8849}
Tim Petersced69f82003-09-16 20:30:58 +00008850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008852fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853{
8854 /* No need to call PyUnicode_READY(self) because this function is only
8855 called as a callback from fixup() which does it already. */
8856 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8857 const int kind = PyUnicode_KIND(self);
8858 void *data = PyUnicode_DATA(self);
8859 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008860 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 Py_ssize_t i;
8862
8863 for (i = 0; i < len; ++i) {
8864 ch = PyUnicode_READ(kind, data, i);
8865 fixed = 0;
8866 if (ch > 127) {
8867 if (Py_UNICODE_ISSPACE(ch))
8868 fixed = ' ';
8869 else {
8870 const int decimal = Py_UNICODE_TODECIMAL(ch);
8871 if (decimal >= 0)
8872 fixed = '0' + decimal;
8873 }
8874 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008875 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 if (fixed > maxchar)
8877 maxchar = fixed;
8878 PyUnicode_WRITE(kind, data, i, fixed);
8879 }
8880 else if (ch > maxchar)
8881 maxchar = ch;
8882 }
8883 else if (ch > maxchar)
8884 maxchar = ch;
8885 }
8886
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008887 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888}
8889
8890PyObject *
8891_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8892{
8893 if (!PyUnicode_Check(unicode)) {
8894 PyErr_BadInternalCall();
8895 return NULL;
8896 }
8897 if (PyUnicode_READY(unicode) == -1)
8898 return NULL;
8899 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8900 /* If the string is already ASCII, just return the same string */
8901 Py_INCREF(unicode);
8902 return unicode;
8903 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008904 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905}
8906
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008907PyObject *
8908PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8909 Py_ssize_t length)
8910{
Victor Stinnerf0124502011-11-21 23:12:56 +01008911 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008912 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008913 Py_UCS4 maxchar;
8914 enum PyUnicode_Kind kind;
8915 void *data;
8916
8917 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008918 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008919 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008920 if (ch > 127) {
8921 int decimal = Py_UNICODE_TODECIMAL(ch);
8922 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008923 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008924 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008925 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008926 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008927
8928 /* Copy to a new string */
8929 decimal = PyUnicode_New(length, maxchar);
8930 if (decimal == NULL)
8931 return decimal;
8932 kind = PyUnicode_KIND(decimal);
8933 data = PyUnicode_DATA(decimal);
8934 /* Iterate over code points */
8935 for (i = 0; i < length; i++) {
8936 Py_UNICODE ch = s[i];
8937 if (ch > 127) {
8938 int decimal = Py_UNICODE_TODECIMAL(ch);
8939 if (decimal >= 0)
8940 ch = '0' + decimal;
8941 }
8942 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008944 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008945}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008946/* --- Decimal Encoder ---------------------------------------------------- */
8947
Alexander Belopolsky40018472011-02-26 01:02:56 +00008948int
8949PyUnicode_EncodeDecimal(Py_UNICODE *s,
8950 Py_ssize_t length,
8951 char *output,
8952 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008953{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008954 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008955 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008956 enum PyUnicode_Kind kind;
8957 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008958
8959 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 PyErr_BadArgument();
8961 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008962 }
8963
Victor Stinner42bf7752011-11-21 22:52:58 +01008964 unicode = PyUnicode_FromUnicode(s, length);
8965 if (unicode == NULL)
8966 return -1;
8967
Benjamin Petersonbac79492012-01-14 13:34:47 -05008968 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008969 Py_DECREF(unicode);
8970 return -1;
8971 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008972 kind = PyUnicode_KIND(unicode);
8973 data = PyUnicode_DATA(unicode);
8974
Victor Stinnerb84d7232011-11-22 01:50:07 +01008975 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008976 PyObject *exc;
8977 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008979 Py_ssize_t startpos;
8980
8981 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008982
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008984 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008985 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 decimal = Py_UNICODE_TODECIMAL(ch);
8989 if (decimal >= 0) {
8990 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008991 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 continue;
8993 }
8994 if (0 < ch && ch < 256) {
8995 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008996 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 continue;
8998 }
Victor Stinner6345be92011-11-25 20:09:01 +01008999
Victor Stinner42bf7752011-11-21 22:52:58 +01009000 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009001 exc = NULL;
9002 raise_encode_exception(&exc, "decimal", unicode,
9003 startpos, startpos+1,
9004 "invalid decimal Unicode string");
9005 Py_XDECREF(exc);
9006 Py_DECREF(unicode);
9007 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009008 }
9009 /* 0-terminate the output string */
9010 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009011 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009012 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009013}
9014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015/* --- Helpers ------------------------------------------------------------ */
9016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009018any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 Py_ssize_t start,
9020 Py_ssize_t end)
9021{
9022 int kind1, kind2, kind;
9023 void *buf1, *buf2;
9024 Py_ssize_t len1, len2, result;
9025
9026 kind1 = PyUnicode_KIND(s1);
9027 kind2 = PyUnicode_KIND(s2);
9028 kind = kind1 > kind2 ? kind1 : kind2;
9029 buf1 = PyUnicode_DATA(s1);
9030 buf2 = PyUnicode_DATA(s2);
9031 if (kind1 != kind)
9032 buf1 = _PyUnicode_AsKind(s1, kind);
9033 if (!buf1)
9034 return -2;
9035 if (kind2 != kind)
9036 buf2 = _PyUnicode_AsKind(s2, kind);
9037 if (!buf2) {
9038 if (kind1 != kind) PyMem_Free(buf1);
9039 return -2;
9040 }
9041 len1 = PyUnicode_GET_LENGTH(s1);
9042 len2 = PyUnicode_GET_LENGTH(s2);
9043
Victor Stinner794d5672011-10-10 03:21:36 +02009044 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009045 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009046 case PyUnicode_1BYTE_KIND:
9047 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9048 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9049 else
9050 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9051 break;
9052 case PyUnicode_2BYTE_KIND:
9053 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9054 break;
9055 case PyUnicode_4BYTE_KIND:
9056 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9057 break;
9058 default:
9059 assert(0); result = -2;
9060 }
9061 }
9062 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009063 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009064 case PyUnicode_1BYTE_KIND:
9065 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9066 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9067 else
9068 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9069 break;
9070 case PyUnicode_2BYTE_KIND:
9071 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9072 break;
9073 case PyUnicode_4BYTE_KIND:
9074 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9075 break;
9076 default:
9077 assert(0); result = -2;
9078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 }
9080
9081 if (kind1 != kind)
9082 PyMem_Free(buf1);
9083 if (kind2 != kind)
9084 PyMem_Free(buf2);
9085
9086 return result;
9087}
9088
9089Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009090_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 Py_ssize_t n_buffer,
9092 void *digits, Py_ssize_t n_digits,
9093 Py_ssize_t min_width,
9094 const char *grouping,
9095 const char *thousands_sep)
9096{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009097 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009099 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9100 return _PyUnicode_ascii_InsertThousandsGrouping(
9101 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9102 min_width, grouping, thousands_sep);
9103 else
9104 return _PyUnicode_ucs1_InsertThousandsGrouping(
9105 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9106 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 case PyUnicode_2BYTE_KIND:
9108 return _PyUnicode_ucs2_InsertThousandsGrouping(
9109 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9110 min_width, grouping, thousands_sep);
9111 case PyUnicode_4BYTE_KIND:
9112 return _PyUnicode_ucs4_InsertThousandsGrouping(
9113 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9114 min_width, grouping, thousands_sep);
9115 }
9116 assert(0);
9117 return -1;
9118}
9119
9120
Thomas Wouters477c8d52006-05-27 19:21:47 +00009121/* helper macro to fixup start/end slice values
   - end greater than len is clamped to len; a negative end has len added
     and is then floored at 0
   - a negative start has len added and is then floored at 0; start is
     not clamped against len on the upper side
   start and end are modified in place and must be lvalues.  The
   arguments are expanded several times and without parentheses, so pass
   plain variables or side-effect-free expressions.  The expansion is two
   bare if-statements, not a single statement: do not use it as the
   unbraced body of an if/else or loop. */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009122#define ADJUST_INDICES(start, end, len) \
9123 if (end > len) \
9124 end = len; \
9125 else if (end < 0) { \
9126 end += len; \
9127 if (end < 0) \
9128 end = 0; \
9129 } \
9130 if (start < 0) { \
9131 start += len; \
9132 if (start < 0) \
9133 start = 0; \
9134 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009135
Alexander Belopolsky40018472011-02-26 01:02:56 +00009136Py_ssize_t
9137PyUnicode_Count(PyObject *str,
9138 PyObject *substr,
9139 Py_ssize_t start,
9140 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009142 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009143 PyObject* str_obj;
9144 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 int kind1, kind2, kind;
9146 void *buf1 = NULL, *buf2 = NULL;
9147 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009148
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009149 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009150 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009152 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009153 if (!sub_obj) {
9154 Py_DECREF(str_obj);
9155 return -1;
9156 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009157 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009158 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 Py_DECREF(str_obj);
9160 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 }
Tim Petersced69f82003-09-16 20:30:58 +00009162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 kind1 = PyUnicode_KIND(str_obj);
9164 kind2 = PyUnicode_KIND(sub_obj);
9165 kind = kind1 > kind2 ? kind1 : kind2;
9166 buf1 = PyUnicode_DATA(str_obj);
9167 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009168 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 if (!buf1)
9170 goto onError;
9171 buf2 = PyUnicode_DATA(sub_obj);
9172 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009173 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 if (!buf2)
9175 goto onError;
9176 len1 = PyUnicode_GET_LENGTH(str_obj);
9177 len2 = PyUnicode_GET_LENGTH(sub_obj);
9178
9179 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009180 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009182 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9183 result = asciilib_count(
9184 ((Py_UCS1*)buf1) + start, end - start,
9185 buf2, len2, PY_SSIZE_T_MAX
9186 );
9187 else
9188 result = ucs1lib_count(
9189 ((Py_UCS1*)buf1) + start, end - start,
9190 buf2, len2, PY_SSIZE_T_MAX
9191 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 break;
9193 case PyUnicode_2BYTE_KIND:
9194 result = ucs2lib_count(
9195 ((Py_UCS2*)buf1) + start, end - start,
9196 buf2, len2, PY_SSIZE_T_MAX
9197 );
9198 break;
9199 case PyUnicode_4BYTE_KIND:
9200 result = ucs4lib_count(
9201 ((Py_UCS4*)buf1) + start, end - start,
9202 buf2, len2, PY_SSIZE_T_MAX
9203 );
9204 break;
9205 default:
9206 assert(0); result = 0;
9207 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009208
9209 Py_DECREF(sub_obj);
9210 Py_DECREF(str_obj);
9211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 if (kind1 != kind)
9213 PyMem_Free(buf1);
9214 if (kind2 != kind)
9215 PyMem_Free(buf2);
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 onError:
9219 Py_DECREF(sub_obj);
9220 Py_DECREF(str_obj);
9221 if (kind1 != kind && buf1)
9222 PyMem_Free(buf1);
9223 if (kind2 != kind && buf2)
9224 PyMem_Free(buf2);
9225 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226}
9227
Alexander Belopolsky40018472011-02-26 01:02:56 +00009228Py_ssize_t
9229PyUnicode_Find(PyObject *str,
9230 PyObject *sub,
9231 Py_ssize_t start,
9232 Py_ssize_t end,
9233 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009235 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009238 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009240 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009241 if (!sub) {
9242 Py_DECREF(str);
9243 return -2;
9244 }
9245 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9246 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 Py_DECREF(str);
9248 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 }
Tim Petersced69f82003-09-16 20:30:58 +00009250
Victor Stinner794d5672011-10-10 03:21:36 +02009251 result = any_find_slice(direction,
9252 str, sub, start, end
9253 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009256 Py_DECREF(sub);
9257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 return result;
9259}
9260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261Py_ssize_t
9262PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9263 Py_ssize_t start, Py_ssize_t end,
9264 int direction)
9265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009267 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 if (PyUnicode_READY(str) == -1)
9269 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009270 if (start < 0 || end < 0) {
9271 PyErr_SetString(PyExc_IndexError, "string index out of range");
9272 return -2;
9273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (end > PyUnicode_GET_LENGTH(str))
9275 end = PyUnicode_GET_LENGTH(str);
9276 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009277 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9278 kind, end-start, ch, direction);
9279 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009281 else
9282 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283}
9284
Alexander Belopolsky40018472011-02-26 01:02:56 +00009285static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009286tailmatch(PyObject *self,
9287 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009288 Py_ssize_t start,
9289 Py_ssize_t end,
9290 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 int kind_self;
9293 int kind_sub;
9294 void *data_self;
9295 void *data_sub;
9296 Py_ssize_t offset;
9297 Py_ssize_t i;
9298 Py_ssize_t end_sub;
9299
9300 if (PyUnicode_READY(self) == -1 ||
9301 PyUnicode_READY(substring) == -1)
9302 return 0;
9303
9304 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 return 1;
9306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9308 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 kind_self = PyUnicode_KIND(self);
9313 data_self = PyUnicode_DATA(self);
9314 kind_sub = PyUnicode_KIND(substring);
9315 data_sub = PyUnicode_DATA(substring);
9316 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9317
9318 if (direction > 0)
9319 offset = end;
9320 else
9321 offset = start;
9322
9323 if (PyUnicode_READ(kind_self, data_self, offset) ==
9324 PyUnicode_READ(kind_sub, data_sub, 0) &&
9325 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9326 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9327 /* If both are of the same kind, memcmp is sufficient */
9328 if (kind_self == kind_sub) {
9329 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009330 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 data_sub,
9332 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009333 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 }
9335 /* otherwise we have to compare each character by first accesing it */
9336 else {
9337 /* We do not need to compare 0 and len(substring)-1 because
9338 the if statement above ensured already that they are equal
9339 when we end up here. */
9340 // TODO: honor direction and do a forward or backwards search
9341 for (i = 1; i < end_sub; ++i) {
9342 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9343 PyUnicode_READ(kind_sub, data_sub, i))
9344 return 0;
9345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
9349
9350 return 0;
9351}
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353Py_ssize_t
9354PyUnicode_Tailmatch(PyObject *str,
9355 PyObject *substr,
9356 Py_ssize_t start,
9357 Py_ssize_t end,
9358 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009360 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 str = PyUnicode_FromObject(str);
9363 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 substr = PyUnicode_FromObject(substr);
9366 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 Py_DECREF(str);
9368 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Tim Petersced69f82003-09-16 20:30:58 +00009370
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009371 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 Py_DECREF(str);
9374 Py_DECREF(substr);
9375 return result;
9376}
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378/* Apply fixfct filter to the Unicode object self and return a
9379 reference to the modified object */
9380
Alexander Belopolsky40018472011-02-26 01:02:56 +00009381static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009382fixup(PyObject *self,
9383 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 PyObject *u;
9386 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009387 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009389 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009392 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 /* fix functions return the new maximum character in a string,
9395 if the kind of the resulting unicode object does not change,
9396 everything is fine. Otherwise we need to change the string kind
9397 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009398 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009399
9400 if (maxchar_new == 0) {
9401 /* no changes */;
9402 if (PyUnicode_CheckExact(self)) {
9403 Py_DECREF(u);
9404 Py_INCREF(self);
9405 return self;
9406 }
9407 else
9408 return u;
9409 }
9410
9411 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 maxchar_new = 127;
9413 else if (maxchar_new <= 255)
9414 maxchar_new = 255;
9415 else if (maxchar_new <= 65535)
9416 maxchar_new = 65535;
9417 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009418 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419
Victor Stinnereaab6042011-12-11 22:22:39 +01009420 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009422
9423 /* In case the maximum character changed, we need to
9424 convert the string to the new category. */
9425 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9426 if (v == NULL) {
9427 Py_DECREF(u);
9428 return NULL;
9429 }
9430 if (maxchar_new > maxchar_old) {
9431 /* If the maxchar increased so that the kind changed, not all
9432 characters are representable anymore and we need to fix the
9433 string again. This only happens in very few cases. */
9434 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9435 maxchar_old = fixfct(v);
9436 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 }
9438 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009439 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009441 Py_DECREF(u);
9442 assert(_PyUnicode_CheckConsistency(v, 1));
9443 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444}
9445
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446static PyObject *
9447ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009449 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9450 char *resdata, *data = PyUnicode_DATA(self);
9451 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009452
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009453 res = PyUnicode_New(len, 127);
9454 if (res == NULL)
9455 return NULL;
9456 resdata = PyUnicode_DATA(res);
9457 if (lower)
9458 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460 _Py_bytes_upper(resdata, data, len);
9461 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462}
9463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009467 Py_ssize_t j;
9468 int final_sigma;
9469 Py_UCS4 c;
9470 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009471
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9473
9474 where ! is a negation and \p{xxx} is a character with property xxx.
9475 */
9476 for (j = i - 1; j >= 0; j--) {
9477 c = PyUnicode_READ(kind, data, j);
9478 if (!_PyUnicode_IsCaseIgnorable(c))
9479 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009481 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9482 if (final_sigma) {
9483 for (j = i + 1; j < length; j++) {
9484 c = PyUnicode_READ(kind, data, j);
9485 if (!_PyUnicode_IsCaseIgnorable(c))
9486 break;
9487 }
9488 final_sigma = j == length || !_PyUnicode_IsCased(c);
9489 }
9490 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491}
9492
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493static int
9494lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9495 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 /* Obscure special case. */
9498 if (c == 0x3A3) {
9499 mapped[0] = handle_capital_sigma(kind, data, length, i);
9500 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009502 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503}
9504
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505static Py_ssize_t
9506do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508 Py_ssize_t i, k = 0;
9509 int n_res, j;
9510 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009511
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 c = PyUnicode_READ(kind, data, 0);
9513 n_res = _PyUnicode_ToUpperFull(c, mapped);
9514 for (j = 0; j < n_res; j++) {
9515 if (mapped[j] > *maxchar)
9516 *maxchar = mapped[j];
9517 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519 for (i = 1; i < length; i++) {
9520 c = PyUnicode_READ(kind, data, i);
9521 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9522 for (j = 0; j < n_res; j++) {
9523 if (mapped[j] > *maxchar)
9524 *maxchar = mapped[j];
9525 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009526 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009527 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009528 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529}
9530
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009531static Py_ssize_t
9532do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9533 Py_ssize_t i, k = 0;
9534
9535 for (i = 0; i < length; i++) {
9536 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9537 int n_res, j;
9538 if (Py_UNICODE_ISUPPER(c)) {
9539 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9540 }
9541 else if (Py_UNICODE_ISLOWER(c)) {
9542 n_res = _PyUnicode_ToUpperFull(c, mapped);
9543 }
9544 else {
9545 n_res = 1;
9546 mapped[0] = c;
9547 }
9548 for (j = 0; j < n_res; j++) {
9549 if (mapped[j] > *maxchar)
9550 *maxchar = mapped[j];
9551 res[k++] = mapped[j];
9552 }
9553 }
9554 return k;
9555}
9556
9557static Py_ssize_t
9558do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9559 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009561 Py_ssize_t i, k = 0;
9562
9563 for (i = 0; i < length; i++) {
9564 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9565 int n_res, j;
9566 if (lower)
9567 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9568 else
9569 n_res = _PyUnicode_ToUpperFull(c, mapped);
9570 for (j = 0; j < n_res; j++) {
9571 if (mapped[j] > *maxchar)
9572 *maxchar = mapped[j];
9573 res[k++] = mapped[j];
9574 }
9575 }
9576 return k;
9577}
9578
9579static Py_ssize_t
9580do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9581{
9582 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9583}
9584
9585static Py_ssize_t
9586do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9587{
9588 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9589}
9590
Benjamin Petersone51757f2012-01-12 21:10:29 -05009591static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009592do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9593{
9594 Py_ssize_t i, k = 0;
9595
9596 for (i = 0; i < length; i++) {
9597 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9598 Py_UCS4 mapped[3];
9599 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9600 for (j = 0; j < n_res; j++) {
9601 if (mapped[j] > *maxchar)
9602 *maxchar = mapped[j];
9603 res[k++] = mapped[j];
9604 }
9605 }
9606 return k;
9607}
9608
9609static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009610do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9611{
9612 Py_ssize_t i, k = 0;
9613 int previous_is_cased;
9614
9615 previous_is_cased = 0;
9616 for (i = 0; i < length; i++) {
9617 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9618 Py_UCS4 mapped[3];
9619 int n_res, j;
9620
9621 if (previous_is_cased)
9622 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9623 else
9624 n_res = _PyUnicode_ToTitleFull(c, mapped);
9625
9626 for (j = 0; j < n_res; j++) {
9627 if (mapped[j] > *maxchar)
9628 *maxchar = mapped[j];
9629 res[k++] = mapped[j];
9630 }
9631
9632 previous_is_cased = _PyUnicode_IsCased(c);
9633 }
9634 return k;
9635}
9636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637static PyObject *
9638case_operation(PyObject *self,
9639 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9640{
9641 PyObject *res = NULL;
9642 Py_ssize_t length, newlength = 0;
9643 int kind, outkind;
9644 void *data, *outdata;
9645 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9646
Benjamin Petersoneea48462012-01-16 14:28:50 -05009647 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648
9649 kind = PyUnicode_KIND(self);
9650 data = PyUnicode_DATA(self);
9651 length = PyUnicode_GET_LENGTH(self);
9652 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9653 if (tmp == NULL)
9654 return PyErr_NoMemory();
9655 newlength = perform(kind, data, length, tmp, &maxchar);
9656 res = PyUnicode_New(newlength, maxchar);
9657 if (res == NULL)
9658 goto leave;
9659 tmpend = tmp + newlength;
9660 outdata = PyUnicode_DATA(res);
9661 outkind = PyUnicode_KIND(res);
9662 switch (outkind) {
9663 case PyUnicode_1BYTE_KIND:
9664 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9665 break;
9666 case PyUnicode_2BYTE_KIND:
9667 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9668 break;
9669 case PyUnicode_4BYTE_KIND:
9670 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9671 break;
9672 default:
9673 assert(0);
9674 break;
9675 }
9676 leave:
9677 PyMem_FREE(tmp);
9678 return res;
9679}
9680
Tim Peters8ce9f162004-08-27 01:49:32 +00009681PyObject *
9682PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009685 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009687 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009688 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9689 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009690 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009692 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009694 int use_memcpy;
9695 unsigned char *res_data = NULL, *sep_data = NULL;
9696 PyObject *last_obj;
9697 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698
Tim Peters05eba1f2004-08-27 21:32:02 +00009699 fseq = PySequence_Fast(seq, "");
9700 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009701 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009702 }
9703
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009704 /* NOTE: the following code can't call back into Python code,
9705 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009706 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009707
Tim Peters05eba1f2004-08-27 21:32:02 +00009708 seqlen = PySequence_Fast_GET_SIZE(fseq);
9709 /* If empty sequence, return u"". */
9710 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009711 Py_DECREF(fseq);
9712 Py_INCREF(unicode_empty);
9713 res = unicode_empty;
9714 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009715 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009716
Tim Peters05eba1f2004-08-27 21:32:02 +00009717 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009719 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009720 if (seqlen == 1) {
9721 if (PyUnicode_CheckExact(items[0])) {
9722 res = items[0];
9723 Py_INCREF(res);
9724 Py_DECREF(fseq);
9725 return res;
9726 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009727 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009728 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009729 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009730 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009731 /* Set up sep and seplen */
9732 if (separator == NULL) {
9733 /* fall back to a blank space separator */
9734 sep = PyUnicode_FromOrdinal(' ');
9735 if (!sep)
9736 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009737 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009738 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009739 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009740 else {
9741 if (!PyUnicode_Check(separator)) {
9742 PyErr_Format(PyExc_TypeError,
9743 "separator: expected str instance,"
9744 " %.80s found",
9745 Py_TYPE(separator)->tp_name);
9746 goto onError;
9747 }
9748 if (PyUnicode_READY(separator))
9749 goto onError;
9750 sep = separator;
9751 seplen = PyUnicode_GET_LENGTH(separator);
9752 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9753 /* inc refcount to keep this code path symmetric with the
9754 above case of a blank separator */
9755 Py_INCREF(sep);
9756 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009757 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009758 }
9759
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009760 /* There are at least two things to join, or else we have a subclass
9761 * of str in the sequence.
9762 * Do a pre-pass to figure out the total amount of space we'll
9763 * need (sz), and see whether all argument are strings.
9764 */
9765 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009766#ifdef Py_DEBUG
9767 use_memcpy = 0;
9768#else
9769 use_memcpy = 1;
9770#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009771 for (i = 0; i < seqlen; i++) {
9772 const Py_ssize_t old_sz = sz;
9773 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009774 if (!PyUnicode_Check(item)) {
9775 PyErr_Format(PyExc_TypeError,
9776 "sequence item %zd: expected str instance,"
9777 " %.80s found",
9778 i, Py_TYPE(item)->tp_name);
9779 goto onError;
9780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 if (PyUnicode_READY(item) == -1)
9782 goto onError;
9783 sz += PyUnicode_GET_LENGTH(item);
9784 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009785 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009786 if (i != 0)
9787 sz += seplen;
9788 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9789 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009790 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009791 goto onError;
9792 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009793 if (use_memcpy && last_obj != NULL) {
9794 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9795 use_memcpy = 0;
9796 }
9797 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009798 }
Tim Petersced69f82003-09-16 20:30:58 +00009799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009801 if (res == NULL)
9802 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009803
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009804 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009805#ifdef Py_DEBUG
9806 use_memcpy = 0;
9807#else
9808 if (use_memcpy) {
9809 res_data = PyUnicode_1BYTE_DATA(res);
9810 kind = PyUnicode_KIND(res);
9811 if (seplen != 0)
9812 sep_data = PyUnicode_1BYTE_DATA(sep);
9813 }
9814#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009816 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009817 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009819 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009820 if (use_memcpy) {
9821 Py_MEMCPY(res_data,
9822 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009823 kind * seplen);
9824 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009825 }
9826 else {
9827 copy_characters(res, res_offset, sep, 0, seplen);
9828 res_offset += seplen;
9829 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009830 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009831 itemlen = PyUnicode_GET_LENGTH(item);
9832 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009833 if (use_memcpy) {
9834 Py_MEMCPY(res_data,
9835 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009836 kind * itemlen);
9837 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009838 }
9839 else {
9840 copy_characters(res, res_offset, item, 0, itemlen);
9841 res_offset += itemlen;
9842 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009843 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009844 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009845 if (use_memcpy)
9846 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009847 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009848 else
9849 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009850
Tim Peters05eba1f2004-08-27 21:32:02 +00009851 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009853 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855
Benjamin Peterson29060642009-01-31 22:14:21 +00009856 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009857 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009859 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860 return NULL;
9861}
9862
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   element width (1-, 2- or 4-byte); the wchar kind is not supported.

   Every argument is parenthesized in the expansion so that arbitrary
   expressions can be passed.  Arguments may be evaluated more than once,
   so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: \
            assert(0); \
        } \
    } while (0)
9886
Victor Stinner3fe55312012-01-04 00:33:50 +01009887Py_ssize_t
9888PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9889 Py_UCS4 fill_char)
9890{
9891 Py_ssize_t maxlen;
9892 enum PyUnicode_Kind kind;
9893 void *data;
9894
9895 if (!PyUnicode_Check(unicode)) {
9896 PyErr_BadInternalCall();
9897 return -1;
9898 }
9899 if (PyUnicode_READY(unicode) == -1)
9900 return -1;
9901 if (unicode_check_modifiable(unicode))
9902 return -1;
9903
9904 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9905 PyErr_SetString(PyExc_ValueError,
9906 "fill character is bigger than "
9907 "the string maximum character");
9908 return -1;
9909 }
9910
9911 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9912 length = Py_MIN(maxlen, length);
9913 if (length <= 0)
9914 return 0;
9915
9916 kind = PyUnicode_KIND(unicode);
9917 data = PyUnicode_DATA(unicode);
9918 FILL(kind, data, fill_char, start, length);
9919 return length;
9920}
9921
Victor Stinner9310abb2011-10-05 00:59:23 +02009922static PyObject *
9923pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009924 Py_ssize_t left,
9925 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyObject *u;
9929 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009930 int kind;
9931 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932
9933 if (left < 0)
9934 left = 0;
9935 if (right < 0)
9936 right = 0;
9937
Victor Stinnerc4b49542011-12-11 22:44:26 +01009938 if (left == 0 && right == 0)
9939 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9942 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009943 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9944 return NULL;
9945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9947 if (fill > maxchar)
9948 maxchar = fill;
9949 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009950 if (!u)
9951 return NULL;
9952
9953 kind = PyUnicode_KIND(u);
9954 data = PyUnicode_DATA(u);
9955 if (left)
9956 FILL(kind, data, fill, 0, left);
9957 if (right)
9958 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009959 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009960 assert(_PyUnicode_CheckConsistency(u, 1));
9961 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964
Alexander Belopolsky40018472011-02-26 01:02:56 +00009965PyObject *
9966PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
9970 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009971 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009973 if (PyUnicode_READY(string) == -1) {
9974 Py_DECREF(string);
9975 return NULL;
9976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
Benjamin Petersonead6b532011-12-20 17:23:42 -06009978 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009980 if (PyUnicode_IS_ASCII(string))
9981 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009982 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009983 PyUnicode_GET_LENGTH(string), keepends);
9984 else
9985 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009986 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 break;
9989 case PyUnicode_2BYTE_KIND:
9990 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009991 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 PyUnicode_GET_LENGTH(string), keepends);
9993 break;
9994 case PyUnicode_4BYTE_KIND:
9995 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009996 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 PyUnicode_GET_LENGTH(string), keepends);
9998 break;
9999 default:
10000 assert(0);
10001 list = 0;
10002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003 Py_DECREF(string);
10004 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005}
10006
Alexander Belopolsky40018472011-02-26 01:02:56 +000010007static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010008split(PyObject *self,
10009 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010010 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 int kind1, kind2, kind;
10013 void *buf1, *buf2;
10014 Py_ssize_t len1, len2;
10015 PyObject* out;
10016
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010018 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (PyUnicode_READY(self) == -1)
10021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010024 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010026 if (PyUnicode_IS_ASCII(self))
10027 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010028 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010029 PyUnicode_GET_LENGTH(self), maxcount
10030 );
10031 else
10032 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010033 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010034 PyUnicode_GET_LENGTH(self), maxcount
10035 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 case PyUnicode_2BYTE_KIND:
10037 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010038 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 PyUnicode_GET_LENGTH(self), maxcount
10040 );
10041 case PyUnicode_4BYTE_KIND:
10042 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010043 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 PyUnicode_GET_LENGTH(self), maxcount
10045 );
10046 default:
10047 assert(0);
10048 return NULL;
10049 }
10050
10051 if (PyUnicode_READY(substring) == -1)
10052 return NULL;
10053
10054 kind1 = PyUnicode_KIND(self);
10055 kind2 = PyUnicode_KIND(substring);
10056 kind = kind1 > kind2 ? kind1 : kind2;
10057 buf1 = PyUnicode_DATA(self);
10058 buf2 = PyUnicode_DATA(substring);
10059 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010060 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (!buf1)
10062 return NULL;
10063 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010064 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 if (!buf2) {
10066 if (kind1 != kind) PyMem_Free(buf1);
10067 return NULL;
10068 }
10069 len1 = PyUnicode_GET_LENGTH(self);
10070 len2 = PyUnicode_GET_LENGTH(substring);
10071
Benjamin Petersonead6b532011-12-20 17:23:42 -060010072 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010074 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10075 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010076 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010077 else
10078 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010079 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 break;
10081 case PyUnicode_2BYTE_KIND:
10082 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010083 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 break;
10085 case PyUnicode_4BYTE_KIND:
10086 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010087 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 break;
10089 default:
10090 out = NULL;
10091 }
10092 if (kind1 != kind)
10093 PyMem_Free(buf1);
10094 if (kind2 != kind)
10095 PyMem_Free(buf2);
10096 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097}
10098
Alexander Belopolsky40018472011-02-26 01:02:56 +000010099static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010100rsplit(PyObject *self,
10101 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010102 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 int kind1, kind2, kind;
10105 void *buf1, *buf2;
10106 Py_ssize_t len1, len2;
10107 PyObject* out;
10108
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010109 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010110 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (PyUnicode_READY(self) == -1)
10113 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010116 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010118 if (PyUnicode_IS_ASCII(self))
10119 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010120 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010121 PyUnicode_GET_LENGTH(self), maxcount
10122 );
10123 else
10124 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010125 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010126 PyUnicode_GET_LENGTH(self), maxcount
10127 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 case PyUnicode_2BYTE_KIND:
10129 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010130 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 PyUnicode_GET_LENGTH(self), maxcount
10132 );
10133 case PyUnicode_4BYTE_KIND:
10134 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010135 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 PyUnicode_GET_LENGTH(self), maxcount
10137 );
10138 default:
10139 assert(0);
10140 return NULL;
10141 }
10142
10143 if (PyUnicode_READY(substring) == -1)
10144 return NULL;
10145
10146 kind1 = PyUnicode_KIND(self);
10147 kind2 = PyUnicode_KIND(substring);
10148 kind = kind1 > kind2 ? kind1 : kind2;
10149 buf1 = PyUnicode_DATA(self);
10150 buf2 = PyUnicode_DATA(substring);
10151 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (!buf1)
10154 return NULL;
10155 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010156 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 if (!buf2) {
10158 if (kind1 != kind) PyMem_Free(buf1);
10159 return NULL;
10160 }
10161 len1 = PyUnicode_GET_LENGTH(self);
10162 len2 = PyUnicode_GET_LENGTH(substring);
10163
Benjamin Petersonead6b532011-12-20 17:23:42 -060010164 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010166 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10167 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010168 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 else
10170 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010171 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 break;
10173 case PyUnicode_2BYTE_KIND:
10174 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010175 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 break;
10177 case PyUnicode_4BYTE_KIND:
10178 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 break;
10181 default:
10182 out = NULL;
10183 }
10184 if (kind1 != kind)
10185 PyMem_Free(buf1);
10186 if (kind2 != kind)
10187 PyMem_Free(buf2);
10188 return out;
10189}
10190
10191static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10193 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010195 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10198 return asciilib_find(buf1, len1, buf2, len2, offset);
10199 else
10200 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 case PyUnicode_2BYTE_KIND:
10202 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10203 case PyUnicode_4BYTE_KIND:
10204 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10205 }
10206 assert(0);
10207 return -1;
10208}
10209
10210static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10212 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010214 switch (kind) {
10215 case PyUnicode_1BYTE_KIND:
10216 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10217 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10218 else
10219 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10220 case PyUnicode_2BYTE_KIND:
10221 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10222 case PyUnicode_4BYTE_KIND:
10223 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10224 }
10225 assert(0);
10226 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010227}
10228
Alexander Belopolsky40018472011-02-26 01:02:56 +000010229static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230replace(PyObject *self, PyObject *str1,
10231 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 PyObject *u;
10234 char *sbuf = PyUnicode_DATA(self);
10235 char *buf1 = PyUnicode_DATA(str1);
10236 char *buf2 = PyUnicode_DATA(str2);
10237 int srelease = 0, release1 = 0, release2 = 0;
10238 int skind = PyUnicode_KIND(self);
10239 int kind1 = PyUnicode_KIND(str1);
10240 int kind2 = PyUnicode_KIND(str2);
10241 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10242 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10243 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010244 int mayshrink;
10245 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
10247 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010248 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010250 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Victor Stinner59de0ee2011-10-07 10:01:28 +020010252 if (str1 == str2)
10253 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 if (skind < kind1)
10255 /* substring too wide to be present */
10256 goto nothing;
10257
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10259 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10260 /* Replacing str1 with str2 may cause a maxchar reduction in the
10261 result string. */
10262 mayshrink = (maxchar_str2 < maxchar);
10263 maxchar = Py_MAX(maxchar, maxchar_str2);
10264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010266 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010268 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010270 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010271 Py_UCS4 u1, u2;
10272 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010273 Py_ssize_t index, pos;
10274 char *src;
10275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010277 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10278 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010284 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010286
10287 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10288 index = 0;
10289 src = sbuf;
10290 while (--maxcount)
10291 {
10292 pos++;
10293 src += pos * PyUnicode_KIND(self);
10294 slen -= pos;
10295 index += pos;
10296 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10297 if (pos < 0)
10298 break;
10299 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10300 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010301 }
10302 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 int rkind = skind;
10304 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010305 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 if (kind1 < rkind) {
10308 /* widen substring */
10309 buf1 = _PyUnicode_AsKind(str1, rkind);
10310 if (!buf1) goto error;
10311 release1 = 1;
10312 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010313 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 if (i < 0)
10315 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (rkind > kind2) {
10317 /* widen replacement */
10318 buf2 = _PyUnicode_AsKind(str2, rkind);
10319 if (!buf2) goto error;
10320 release2 = 1;
10321 }
10322 else if (rkind < kind2) {
10323 /* widen self and buf1 */
10324 rkind = kind2;
10325 if (release1) PyMem_Free(buf1);
10326 sbuf = _PyUnicode_AsKind(self, rkind);
10327 if (!sbuf) goto error;
10328 srelease = 1;
10329 buf1 = _PyUnicode_AsKind(str1, rkind);
10330 if (!buf1) goto error;
10331 release1 = 1;
10332 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010333 u = PyUnicode_New(slen, maxchar);
10334 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010336 assert(PyUnicode_KIND(u) == rkind);
10337 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010338
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010339 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010340 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010341 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010343 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010345
10346 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010348 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010350 if (i == -1)
10351 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010352 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010358 }
10359 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 Py_ssize_t n, i, j, ires;
10361 Py_ssize_t product, new_size;
10362 int rkind = skind;
10363 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010366 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 buf1 = _PyUnicode_AsKind(str1, rkind);
10368 if (!buf1) goto error;
10369 release1 = 1;
10370 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010372 if (n == 0)
10373 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010375 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 buf2 = _PyUnicode_AsKind(str2, rkind);
10377 if (!buf2) goto error;
10378 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010381 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 rkind = kind2;
10383 sbuf = _PyUnicode_AsKind(self, rkind);
10384 if (!sbuf) goto error;
10385 srelease = 1;
10386 if (release1) PyMem_Free(buf1);
10387 buf1 = _PyUnicode_AsKind(str1, rkind);
10388 if (!buf1) goto error;
10389 release1 = 1;
10390 }
10391 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10392 PyUnicode_GET_LENGTH(str1))); */
10393 product = n * (len2-len1);
10394 if ((product / (len2-len1)) != n) {
10395 PyErr_SetString(PyExc_OverflowError,
10396 "replace string is too long");
10397 goto error;
10398 }
10399 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010400 if (new_size == 0) {
10401 Py_INCREF(unicode_empty);
10402 u = unicode_empty;
10403 goto done;
10404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10406 PyErr_SetString(PyExc_OverflowError,
10407 "replace string is too long");
10408 goto error;
10409 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010410 u = PyUnicode_New(new_size, maxchar);
10411 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010413 assert(PyUnicode_KIND(u) == rkind);
10414 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 ires = i = 0;
10416 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010417 while (n-- > 0) {
10418 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010420 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010422 if (j == -1)
10423 break;
10424 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010425 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010426 memcpy(res + rkind * ires,
10427 sbuf + rkind * i,
10428 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430 }
10431 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010433 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010435 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010442 memcpy(res + rkind * ires,
10443 sbuf + rkind * i,
10444 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010445 }
10446 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010447 /* interleave */
10448 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010449 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010451 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 if (--n <= 0)
10454 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 memcpy(res + rkind * ires,
10456 sbuf + rkind * i,
10457 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 ires++;
10459 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010461 memcpy(res + rkind * ires,
10462 sbuf + rkind * i,
10463 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010464 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010465 }
10466
10467 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010468 unicode_adjust_maxchar(&u);
10469 if (u == NULL)
10470 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010472
10473 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (srelease)
10475 PyMem_FREE(sbuf);
10476 if (release1)
10477 PyMem_FREE(buf1);
10478 if (release2)
10479 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010480 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010482
Benjamin Peterson29060642009-01-31 22:14:21 +000010483 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 if (srelease)
10486 PyMem_FREE(sbuf);
10487 if (release1)
10488 PyMem_FREE(buf1);
10489 if (release2)
10490 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010491 return unicode_result_unchanged(self);
10492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 error:
10494 if (srelease && sbuf)
10495 PyMem_FREE(sbuf);
10496 if (release1 && buf1)
10497 PyMem_FREE(buf1);
10498 if (release2 && buf2)
10499 PyMem_FREE(buf2);
10500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501}
10502
10503/* --- Unicode Object Methods --------------------------------------------- */
10504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010505PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507\n\
10508Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010509characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510
10511static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010512unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010514 if (PyUnicode_READY(self) == -1)
10515 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010516 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517}
10518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010519PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521\n\
10522Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010523have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524
10525static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010526unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010528 if (PyUnicode_READY(self) == -1)
10529 return NULL;
10530 if (PyUnicode_GET_LENGTH(self) == 0)
10531 return unicode_result_unchanged(self);
10532 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533}
10534
Benjamin Petersond5890c82012-01-14 13:23:30 -050010535PyDoc_STRVAR(casefold__doc__,
10536 "S.casefold() -> str\n\
10537\n\
10538Return a version of S suitable for caseless comparisons.");
10539
10540static PyObject *
10541unicode_casefold(PyObject *self)
10542{
10543 if (PyUnicode_READY(self) == -1)
10544 return NULL;
10545 if (PyUnicode_IS_ASCII(self))
10546 return ascii_upper_or_lower(self, 1);
10547 return case_operation(self, do_casefold);
10548}
10549
10550
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010551/* Argument converter. Coerces to a single unicode character */
10552
10553static int
10554convert_uc(PyObject *obj, void *addr)
10555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010557 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010558
Benjamin Peterson14339b62009-01-31 16:36:08 +000010559 uniobj = PyUnicode_FromObject(obj);
10560 if (uniobj == NULL) {
10561 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010562 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 return 0;
10564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010566 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010567 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010568 Py_DECREF(uniobj);
10569 return 0;
10570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 Py_DECREF(uniobj);
10573 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010574}
10575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010576PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010579Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010580done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581
10582static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010583unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010585 Py_ssize_t marg, left;
10586 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 Py_UCS4 fillchar = ' ';
10588
Victor Stinnere9a29352011-10-01 02:14:59 +020010589 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591
Benjamin Petersonbac79492012-01-14 13:34:47 -050010592 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 return NULL;
10594
Victor Stinnerc4b49542011-12-11 22:44:26 +010010595 if (PyUnicode_GET_LENGTH(self) >= width)
10596 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
Victor Stinnerc4b49542011-12-11 22:44:26 +010010598 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 left = marg / 2 + (marg & width & 1);
10600
Victor Stinner9310abb2011-10-05 00:59:23 +020010601 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602}
10603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604/* This function assumes that str1 and str2 are readied by the caller. */
10605
Marc-André Lemburge5034372000-08-08 08:04:29 +000010606static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010607unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 int kind1, kind2;
10610 void *data1, *data2;
10611 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 kind1 = PyUnicode_KIND(str1);
10614 kind2 = PyUnicode_KIND(str2);
10615 data1 = PyUnicode_DATA(str1);
10616 data2 = PyUnicode_DATA(str2);
10617 len1 = PyUnicode_GET_LENGTH(str1);
10618 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 for (i = 0; i < len1 && i < len2; ++i) {
10621 Py_UCS4 c1, c2;
10622 c1 = PyUnicode_READ(kind1, data1, i);
10623 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010624
10625 if (c1 != c2)
10626 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010627 }
10628
10629 return (len1 < len2) ? -1 : (len1 != len2);
10630}
10631
Alexander Belopolsky40018472011-02-26 01:02:56 +000010632int
10633PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10636 if (PyUnicode_READY(left) == -1 ||
10637 PyUnicode_READY(right) == -1)
10638 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010639 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010641 PyErr_Format(PyExc_TypeError,
10642 "Can't compare %.100s and %.100s",
10643 left->ob_type->tp_name,
10644 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 return -1;
10646}
10647
Martin v. Löwis5b222132007-06-10 09:51:05 +000010648int
10649PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 Py_ssize_t i;
10652 int kind;
10653 void *data;
10654 Py_UCS4 chr;
10655
Victor Stinner910337b2011-10-03 03:20:16 +020010656 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (PyUnicode_READY(uni) == -1)
10658 return -1;
10659 kind = PyUnicode_KIND(uni);
10660 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010661 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10663 if (chr != str[i])
10664 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010665 /* This check keeps Python strings that end in '\0' from comparing equal
10666 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010668 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010669 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010671 return 0;
10672}
10673
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010674
Benjamin Peterson29060642009-01-31 22:14:21 +000010675#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010676 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010677
Alexander Belopolsky40018472011-02-26 01:02:56 +000010678PyObject *
10679PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010680{
10681 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010682
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010683 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10684 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (PyUnicode_READY(left) == -1 ||
10686 PyUnicode_READY(right) == -1)
10687 return NULL;
10688 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10689 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010690 if (op == Py_EQ) {
10691 Py_INCREF(Py_False);
10692 return Py_False;
10693 }
10694 if (op == Py_NE) {
10695 Py_INCREF(Py_True);
10696 return Py_True;
10697 }
10698 }
10699 if (left == right)
10700 result = 0;
10701 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010702 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010703
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010704 /* Convert the return value to a Boolean */
10705 switch (op) {
10706 case Py_EQ:
10707 v = TEST_COND(result == 0);
10708 break;
10709 case Py_NE:
10710 v = TEST_COND(result != 0);
10711 break;
10712 case Py_LE:
10713 v = TEST_COND(result <= 0);
10714 break;
10715 case Py_GE:
10716 v = TEST_COND(result >= 0);
10717 break;
10718 case Py_LT:
10719 v = TEST_COND(result == -1);
10720 break;
10721 case Py_GT:
10722 v = TEST_COND(result == 1);
10723 break;
10724 default:
10725 PyErr_BadArgument();
10726 return NULL;
10727 }
10728 Py_INCREF(v);
10729 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010730 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010731
Brian Curtindfc80e32011-08-10 20:28:54 -050010732 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010733}
10734
Alexander Belopolsky40018472011-02-26 01:02:56 +000010735int
10736PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010737{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010738 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 int kind1, kind2, kind;
10740 void *buf1, *buf2;
10741 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010742 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010743
10744 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010745 sub = PyUnicode_FromObject(element);
10746 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 PyErr_Format(PyExc_TypeError,
10748 "'in <string>' requires string as left operand, not %s",
10749 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010751 }
10752
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010754 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010755 Py_DECREF(sub);
10756 return -1;
10757 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010758 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10759 Py_DECREF(sub);
10760 Py_DECREF(str);
10761 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 kind1 = PyUnicode_KIND(str);
10764 kind2 = PyUnicode_KIND(sub);
10765 kind = kind1 > kind2 ? kind1 : kind2;
10766 buf1 = PyUnicode_DATA(str);
10767 buf2 = PyUnicode_DATA(sub);
10768 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010769 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 if (!buf1) {
10771 Py_DECREF(sub);
10772 return -1;
10773 }
10774 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010775 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (!buf2) {
10777 Py_DECREF(sub);
10778 if (kind1 != kind) PyMem_Free(buf1);
10779 return -1;
10780 }
10781 len1 = PyUnicode_GET_LENGTH(str);
10782 len2 = PyUnicode_GET_LENGTH(sub);
10783
Benjamin Petersonead6b532011-12-20 17:23:42 -060010784 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 case PyUnicode_1BYTE_KIND:
10786 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10787 break;
10788 case PyUnicode_2BYTE_KIND:
10789 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10790 break;
10791 case PyUnicode_4BYTE_KIND:
10792 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10793 break;
10794 default:
10795 result = -1;
10796 assert(0);
10797 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010798
10799 Py_DECREF(str);
10800 Py_DECREF(sub);
10801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (kind1 != kind)
10803 PyMem_Free(buf1);
10804 if (kind2 != kind)
10805 PyMem_Free(buf2);
10806
Guido van Rossum403d68b2000-03-13 15:55:09 +000010807 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010808}
10809
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810/* Concat to string or Unicode object giving a new Unicode object. */
10811
Alexander Belopolsky40018472011-02-26 01:02:56 +000010812PyObject *
10813PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010816 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010817 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
10819 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010822 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
10827 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010828 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010829 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010832 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 }
10836
Victor Stinner488fa492011-12-12 00:01:39 +010010837 u_len = PyUnicode_GET_LENGTH(u);
10838 v_len = PyUnicode_GET_LENGTH(v);
10839 if (u_len > PY_SSIZE_T_MAX - v_len) {
10840 PyErr_SetString(PyExc_OverflowError,
10841 "strings are too large to concat");
10842 goto onError;
10843 }
10844 new_len = u_len + v_len;
10845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010847 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10848 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010851 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010853 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010854 copy_characters(w, 0, u, 0, u_len);
10855 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856 Py_DECREF(u);
10857 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010858 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 Py_XDECREF(u);
10863 Py_XDECREF(v);
10864 return NULL;
10865}
10866
Walter Dörwald1ab83302007-05-18 17:15:44 +000010867void
Victor Stinner23e56682011-10-03 03:54:37 +020010868PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010869{
Victor Stinner23e56682011-10-03 03:54:37 +020010870 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010871 Py_UCS4 maxchar, maxchar2;
10872 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010873
10874 if (p_left == NULL) {
10875 if (!PyErr_Occurred())
10876 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877 return;
10878 }
Victor Stinner23e56682011-10-03 03:54:37 +020010879 left = *p_left;
10880 if (right == NULL || !PyUnicode_Check(left)) {
10881 if (!PyErr_Occurred())
10882 PyErr_BadInternalCall();
10883 goto error;
10884 }
10885
Benjamin Petersonbac79492012-01-14 13:34:47 -050010886 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010887 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010888 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010889 goto error;
10890
Victor Stinner488fa492011-12-12 00:01:39 +010010891 /* Shortcuts */
10892 if (left == unicode_empty) {
10893 Py_DECREF(left);
10894 Py_INCREF(right);
10895 *p_left = right;
10896 return;
10897 }
10898 if (right == unicode_empty)
10899 return;
10900
10901 left_len = PyUnicode_GET_LENGTH(left);
10902 right_len = PyUnicode_GET_LENGTH(right);
10903 if (left_len > PY_SSIZE_T_MAX - right_len) {
10904 PyErr_SetString(PyExc_OverflowError,
10905 "strings are too large to concat");
10906 goto error;
10907 }
10908 new_len = left_len + right_len;
10909
10910 if (unicode_modifiable(left)
10911 && PyUnicode_CheckExact(right)
10912 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010913 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10914 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010915 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010916 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010917 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10918 {
10919 /* append inplace */
10920 if (unicode_resize(p_left, new_len) != 0) {
10921 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10922 * deallocated so it cannot be put back into
10923 * 'variable'. The MemoryError is raised when there
10924 * is no value in 'variable', which might (very
10925 * remotely) be a cause of incompatibilities.
10926 */
10927 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010928 }
Victor Stinner488fa492011-12-12 00:01:39 +010010929 /* copy 'right' into the newly allocated area of 'left' */
10930 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010931 }
Victor Stinner488fa492011-12-12 00:01:39 +010010932 else {
10933 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10934 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10935 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010936
Victor Stinner488fa492011-12-12 00:01:39 +010010937 /* Concat the two Unicode strings */
10938 res = PyUnicode_New(new_len, maxchar);
10939 if (res == NULL)
10940 goto error;
10941 copy_characters(res, 0, left, 0, left_len);
10942 copy_characters(res, left_len, right, 0, right_len);
10943 Py_DECREF(left);
10944 *p_left = res;
10945 }
10946 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010947 return;
10948
10949error:
Victor Stinner488fa492011-12-12 00:01:39 +010010950 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010951}
10952
10953void
10954PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10955{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010956 PyUnicode_Append(pleft, right);
10957 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010958}
10959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010960PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010961 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010963Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010964string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010968unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010970 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010971 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010972 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 int kind1, kind2, kind;
10975 void *buf1, *buf2;
10976 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
Jesus Ceaac451502011-04-20 17:09:23 +020010978 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10979 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 kind1 = PyUnicode_KIND(self);
10983 kind2 = PyUnicode_KIND(substring);
10984 kind = kind1 > kind2 ? kind1 : kind2;
10985 buf1 = PyUnicode_DATA(self);
10986 buf2 = PyUnicode_DATA(substring);
10987 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010988 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (!buf1) {
10990 Py_DECREF(substring);
10991 return NULL;
10992 }
10993 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010994 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 if (!buf2) {
10996 Py_DECREF(substring);
10997 if (kind1 != kind) PyMem_Free(buf1);
10998 return NULL;
10999 }
11000 len1 = PyUnicode_GET_LENGTH(self);
11001 len2 = PyUnicode_GET_LENGTH(substring);
11002
11003 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011004 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 case PyUnicode_1BYTE_KIND:
11006 iresult = ucs1lib_count(
11007 ((Py_UCS1*)buf1) + start, end - start,
11008 buf2, len2, PY_SSIZE_T_MAX
11009 );
11010 break;
11011 case PyUnicode_2BYTE_KIND:
11012 iresult = ucs2lib_count(
11013 ((Py_UCS2*)buf1) + start, end - start,
11014 buf2, len2, PY_SSIZE_T_MAX
11015 );
11016 break;
11017 case PyUnicode_4BYTE_KIND:
11018 iresult = ucs4lib_count(
11019 ((Py_UCS4*)buf1) + start, end - start,
11020 buf2, len2, PY_SSIZE_T_MAX
11021 );
11022 break;
11023 default:
11024 assert(0); iresult = 0;
11025 }
11026
11027 result = PyLong_FromSsize_t(iresult);
11028
11029 if (kind1 != kind)
11030 PyMem_Free(buf1);
11031 if (kind2 != kind)
11032 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
11034 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011035
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 return result;
11037}
11038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011039PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011040 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011042Encode S using the codec registered for encoding. Default encoding\n\
11043is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011044handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011045a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11046'xmlcharrefreplace' as well as any other name registered with\n\
11047codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
11049static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011050unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011052 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 char *encoding = NULL;
11054 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011055
Benjamin Peterson308d6372009-09-18 21:42:35 +000011056 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11057 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011059 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011060}
11061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011062PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064\n\
11065Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011066If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011069unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011071 Py_ssize_t i, j, line_pos, src_len, incr;
11072 Py_UCS4 ch;
11073 PyObject *u;
11074 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011076 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011077 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
11079 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
Antoine Pitrou22425222011-10-04 19:10:51 +020011082 if (PyUnicode_READY(self) == -1)
11083 return NULL;
11084
Thomas Wouters7e474022000-07-16 12:04:32 +000011085 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011086 src_len = PyUnicode_GET_LENGTH(self);
11087 i = j = line_pos = 0;
11088 kind = PyUnicode_KIND(self);
11089 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011090 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011091 for (; i < src_len; i++) {
11092 ch = PyUnicode_READ(kind, src_data, i);
11093 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011094 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011096 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011098 goto overflow;
11099 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011100 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011101 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011105 goto overflow;
11106 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011108 if (ch == '\n' || ch == '\r')
11109 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011111 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011112 if (!found)
11113 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011114
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011116 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117 if (!u)
11118 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011119 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
Antoine Pitroue71d5742011-10-04 15:55:09 +020011121 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
Antoine Pitroue71d5742011-10-04 15:55:09 +020011123 for (; i < src_len; i++) {
11124 ch = PyUnicode_READ(kind, src_data, i);
11125 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011126 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011127 incr = tabsize - (line_pos % tabsize);
11128 line_pos += incr;
11129 while (incr--) {
11130 PyUnicode_WRITE(kind, dest_data, j, ' ');
11131 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011132 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011133 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011134 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011136 line_pos++;
11137 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011138 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011139 if (ch == '\n' || ch == '\r')
11140 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011142 }
11143 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011144 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011145
Antoine Pitroue71d5742011-10-04 15:55:09 +020011146 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011147 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149}
11150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011151PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153\n\
11154Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011155such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156arguments start and end are interpreted as in slice notation.\n\
11157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
11160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011163 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011164 Py_ssize_t start;
11165 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Jesus Ceaac451502011-04-20 17:09:23 +020011168 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11169 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (PyUnicode_READY(self) == -1)
11173 return NULL;
11174 if (PyUnicode_READY(substring) == -1)
11175 return NULL;
11176
Victor Stinner7931d9a2011-11-04 00:22:48 +010011177 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178
11179 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 if (result == -2)
11182 return NULL;
11183
Christian Heimes217cfd12007-12-02 14:31:20 +000011184 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185}
11186
11187static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011188unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011190 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11191 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194}
11195
Guido van Rossumc2504932007-09-18 19:42:40 +000011196/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011197 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011198static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011199unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200{
Guido van Rossumc2504932007-09-18 19:42:40 +000011201 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011202 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (_PyUnicode_HASH(self) != -1)
11205 return _PyUnicode_HASH(self);
11206 if (PyUnicode_READY(self) == -1)
11207 return -1;
11208 len = PyUnicode_GET_LENGTH(self);
11209
11210 /* The hash function as a macro, gets expanded three times below. */
11211#define HASH(P) \
11212 x = (Py_uhash_t)*P << 7; \
11213 while (--len >= 0) \
Gregory P. Smithf5b62a92012-01-14 15:45:13 -080011214 x = (_PyHASH_MULTIPLIER*x) ^ (Py_uhash_t)*P++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215
11216 switch (PyUnicode_KIND(self)) {
11217 case PyUnicode_1BYTE_KIND: {
11218 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11219 HASH(c);
11220 break;
11221 }
11222 case PyUnicode_2BYTE_KIND: {
11223 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11224 HASH(s);
11225 break;
11226 }
11227 default: {
11228 Py_UCS4 *l;
11229 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11230 "Impossible switch case in unicode_hash");
11231 l = PyUnicode_4BYTE_DATA(self);
11232 HASH(l);
11233 break;
11234 }
11235 }
11236 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11237
Guido van Rossumc2504932007-09-18 19:42:40 +000011238 if (x == -1)
11239 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011241 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011248Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
11250static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011253 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011254 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011255 Py_ssize_t start;
11256 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
Jesus Ceaac451502011-04-20 17:09:23 +020011258 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11259 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if (PyUnicode_READY(self) == -1)
11263 return NULL;
11264 if (PyUnicode_READY(substring) == -1)
11265 return NULL;
11266
Victor Stinner7931d9a2011-11-04 00:22:48 +010011267 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
11269 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 if (result == -2)
11272 return NULL;
11273
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 if (result < 0) {
11275 PyErr_SetString(PyExc_ValueError, "substring not found");
11276 return NULL;
11277 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011278
Christian Heimes217cfd12007-12-02 14:31:20 +000011279 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280}
11281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011285Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011286at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011289unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 Py_ssize_t i, length;
11292 int kind;
11293 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 int cased;
11295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if (PyUnicode_READY(self) == -1)
11297 return NULL;
11298 length = PyUnicode_GET_LENGTH(self);
11299 kind = PyUnicode_KIND(self);
11300 data = PyUnicode_DATA(self);
11301
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (length == 1)
11304 return PyBool_FromLong(
11305 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011307 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011310
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 for (i = 0; i < length; i++) {
11313 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011314
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11316 return PyBool_FromLong(0);
11317 else if (!cased && Py_UNICODE_ISLOWER(ch))
11318 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011320 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321}
11322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011323PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011326Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011327at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
11329static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011330unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 Py_ssize_t i, length;
11333 int kind;
11334 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335 int cased;
11336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 if (PyUnicode_READY(self) == -1)
11338 return NULL;
11339 length = PyUnicode_GET_LENGTH(self);
11340 kind = PyUnicode_KIND(self);
11341 data = PyUnicode_DATA(self);
11342
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (length == 1)
11345 return PyBool_FromLong(
11346 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011348 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011351
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 for (i = 0; i < length; i++) {
11354 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011355
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11357 return PyBool_FromLong(0);
11358 else if (!cased && Py_UNICODE_ISUPPER(ch))
11359 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011361 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362}
11363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011367Return True if S is a titlecased string and there is at least one\n\
11368character in S, i.e. upper- and titlecase characters may only\n\
11369follow uncased characters and lowercase characters only cased ones.\n\
11370Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011373unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 Py_ssize_t i, length;
11376 int kind;
11377 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378 int cased, previous_is_cased;
11379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 if (PyUnicode_READY(self) == -1)
11381 return NULL;
11382 length = PyUnicode_GET_LENGTH(self);
11383 kind = PyUnicode_KIND(self);
11384 data = PyUnicode_DATA(self);
11385
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (length == 1) {
11388 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11389 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11390 (Py_UNICODE_ISUPPER(ch) != 0));
11391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011393 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011395 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011396
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 cased = 0;
11398 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 for (i = 0; i < length; i++) {
11400 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011401
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11403 if (previous_is_cased)
11404 return PyBool_FromLong(0);
11405 previous_is_cased = 1;
11406 cased = 1;
11407 }
11408 else if (Py_UNICODE_ISLOWER(ch)) {
11409 if (!previous_is_cased)
11410 return PyBool_FromLong(0);
11411 previous_is_cased = 1;
11412 cased = 1;
11413 }
11414 else
11415 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011423Return True if all characters in S are whitespace\n\
11424and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 Py_ssize_t i, length;
11430 int kind;
11431 void *data;
11432
11433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435 length = PyUnicode_GET_LENGTH(self);
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (length == 1)
11441 return PyBool_FromLong(
11442 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011444 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 for (i = 0; i < length; i++) {
11449 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011450 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011453 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011458\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011459Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011461
11462static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011463unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 Py_ssize_t i, length;
11466 int kind;
11467 void *data;
11468
11469 if (PyUnicode_READY(self) == -1)
11470 return NULL;
11471 length = PyUnicode_GET_LENGTH(self);
11472 kind = PyUnicode_KIND(self);
11473 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011474
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011475 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (length == 1)
11477 return PyBool_FromLong(
11478 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011479
11480 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 for (i = 0; i < length; i++) {
11485 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011488 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011489}
11490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011491PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011493\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011494Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011495and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011496
11497static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011498unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 int kind;
11501 void *data;
11502 Py_ssize_t len, i;
11503
11504 if (PyUnicode_READY(self) == -1)
11505 return NULL;
11506
11507 kind = PyUnicode_KIND(self);
11508 data = PyUnicode_DATA(self);
11509 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011510
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011511 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 if (len == 1) {
11513 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11514 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11515 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011516
11517 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 for (i = 0; i < len; i++) {
11522 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011523 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011526 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011532Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011536unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 Py_ssize_t i, length;
11539 int kind;
11540 void *data;
11541
11542 if (PyUnicode_READY(self) == -1)
11543 return NULL;
11544 length = PyUnicode_GET_LENGTH(self);
11545 kind = PyUnicode_KIND(self);
11546 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 if (length == 1)
11550 return PyBool_FromLong(
11551 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011553 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 for (i = 0; i < length; i++) {
11558 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011561 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562}
11563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011564PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011567Return True if all characters in S are digits\n\
11568and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
11570static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011571unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 Py_ssize_t i, length;
11574 int kind;
11575 void *data;
11576
11577 if (PyUnicode_READY(self) == -1)
11578 return NULL;
11579 length = PyUnicode_GET_LENGTH(self);
11580 kind = PyUnicode_KIND(self);
11581 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (length == 1) {
11585 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11586 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011589 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 for (i = 0; i < length; i++) {
11594 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011597 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598}
11599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011603Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t i, length;
11610 int kind;
11611 void *data;
11612
11613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615 length = PyUnicode_GET_LENGTH(self);
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (length == 1)
11621 return PyBool_FromLong(
11622 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011624 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 for (i = 0; i < length; i++) {
11629 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633}
11634
Martin v. Löwis47383402007-08-15 07:32:56 +000011635int
11636PyUnicode_IsIdentifier(PyObject *self)
11637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 int kind;
11639 void *data;
11640 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011641 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (PyUnicode_READY(self) == -1) {
11644 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 }
11647
11648 /* Special case for empty strings */
11649 if (PyUnicode_GET_LENGTH(self) == 0)
11650 return 0;
11651 kind = PyUnicode_KIND(self);
11652 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011653
11654 /* PEP 3131 says that the first character must be in
11655 XID_Start and subsequent characters in XID_Continue,
11656 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011658 letters, digits, underscore). However, given the current
11659 definition of XID_Start and XID_Continue, it is sufficient
11660 to check just for these, except that _ must be allowed
11661 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011663 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011664 return 0;
11665
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011666 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011669 return 1;
11670}
11671
11672PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011673 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011674\n\
11675Return True if S is a valid identifier according\n\
11676to the language definition.");
11677
11678static PyObject*
11679unicode_isidentifier(PyObject *self)
11680{
11681 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11682}
11683
Georg Brandl559e5d72008-06-11 18:37:52 +000011684PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011686\n\
11687Return True if all characters in S are considered\n\
11688printable in repr() or S is empty, False otherwise.");
11689
11690static PyObject*
11691unicode_isprintable(PyObject *self)
11692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 Py_ssize_t i, length;
11694 int kind;
11695 void *data;
11696
11697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699 length = PyUnicode_GET_LENGTH(self);
11700 kind = PyUnicode_KIND(self);
11701 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011702
11703 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (length == 1)
11705 return PyBool_FromLong(
11706 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 for (i = 0; i < length; i++) {
11709 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011710 Py_RETURN_FALSE;
11711 }
11712 }
11713 Py_RETURN_TRUE;
11714}
11715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011716PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011717 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718\n\
11719Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011720iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
11722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011723unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011725 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726}
11727
Martin v. Löwis18e16552006-02-15 17:27:45 +000011728static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011729unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 if (PyUnicode_READY(self) == -1)
11732 return -1;
11733 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734}
11735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011736PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011739Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011740done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
11742static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011743unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011745 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 Py_UCS4 fillchar = ' ';
11747
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011748 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 return NULL;
11750
Benjamin Petersonbac79492012-01-14 13:34:47 -050011751 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
Victor Stinnerc4b49542011-12-11 22:44:26 +010011754 if (PyUnicode_GET_LENGTH(self) >= width)
11755 return unicode_result_unchanged(self);
11756
11757 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011766unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011768 if (PyUnicode_READY(self) == -1)
11769 return NULL;
11770 if (PyUnicode_IS_ASCII(self))
11771 return ascii_upper_or_lower(self, 1);
11772 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773}
11774
/* Strip modes; also used as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of the
   format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11783
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784/* externally visible for str.strip(unicode) */
11785PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 void *data;
11789 int kind;
11790 Py_ssize_t i, j, len;
11791 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11794 return NULL;
11795
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_DATA(self);
11798 len = PyUnicode_GET_LENGTH(self);
11799 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11800 PyUnicode_DATA(sepobj),
11801 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011802
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803 i = 0;
11804 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 while (i < len &&
11806 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 i++;
11808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011809 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 j = len;
11812 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 do {
11814 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 } while (j >= i &&
11816 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011818 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011819
Victor Stinner7931d9a2011-11-04 00:22:48 +010011820 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821}
11822
11823PyObject*
11824PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11825{
11826 unsigned char *data;
11827 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011828 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829
Victor Stinnerde636f32011-10-01 03:55:54 +020011830 if (PyUnicode_READY(self) == -1)
11831 return NULL;
11832
11833 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11834
Victor Stinner12bab6d2011-10-01 01:53:49 +020011835 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011836 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837
Victor Stinner12bab6d2011-10-01 01:53:49 +020011838 length = end - start;
11839 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011840 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841
Victor Stinnerde636f32011-10-01 03:55:54 +020011842 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011843 PyErr_SetString(PyExc_IndexError, "string index out of range");
11844 return NULL;
11845 }
11846
Victor Stinnerb9275c12011-10-05 14:01:42 +020011847 if (PyUnicode_IS_ASCII(self)) {
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_1BYTE_DATA(self);
11850 return unicode_fromascii(data + start, length);
11851 }
11852 else {
11853 kind = PyUnicode_KIND(self);
11854 data = PyUnicode_1BYTE_DATA(self);
11855 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011856 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011857 length);
11858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
11861static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011862do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 int kind;
11865 void *data;
11866 Py_ssize_t len, i, j;
11867
11868 if (PyUnicode_READY(self) == -1)
11869 return NULL;
11870
11871 kind = PyUnicode_KIND(self);
11872 data = PyUnicode_DATA(self);
11873 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011874
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 i = 0;
11876 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011878 i++;
11879 }
11880 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011881
Benjamin Peterson14339b62009-01-31 16:36:08 +000011882 j = len;
11883 if (striptype != LEFTSTRIP) {
11884 do {
11885 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011887 j++;
11888 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011889
Victor Stinner7931d9a2011-11-04 00:22:48 +010011890 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891}
11892
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011893
11894static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011896{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011897 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011898
Benjamin Peterson14339b62009-01-31 16:36:08 +000011899 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11900 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011901
Benjamin Peterson14339b62009-01-31 16:36:08 +000011902 if (sep != NULL && sep != Py_None) {
11903 if (PyUnicode_Check(sep))
11904 return _PyUnicode_XStrip(self, striptype, sep);
11905 else {
11906 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 "%s arg must be None or str",
11908 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011909 return NULL;
11910 }
11911 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011912
Benjamin Peterson14339b62009-01-31 16:36:08 +000011913 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011914}
11915
11916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011917PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011919\n\
11920Return a copy of the string S with leading and trailing\n\
11921whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011922If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011923
11924static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011925unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011926{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011927 if (PyTuple_GET_SIZE(args) == 0)
11928 return do_strip(self, BOTHSTRIP); /* Common case */
11929 else
11930 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011931}
11932
11933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011934PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011936\n\
11937Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011938If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011939
11940static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011941unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011942{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011943 if (PyTuple_GET_SIZE(args) == 0)
11944 return do_strip(self, LEFTSTRIP); /* Common case */
11945 else
11946 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011947}
11948
11949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011950PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952\n\
11953Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011954If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011955
11956static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011957unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011958{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011959 if (PyTuple_GET_SIZE(args) == 0)
11960 return do_strip(self, RIGHTSTRIP); /* Common case */
11961 else
11962 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011963}
11964
11965
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011967unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011969 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Georg Brandl222de0f2009-04-12 12:01:50 +000011972 if (len < 1) {
11973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011974 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
Victor Stinnerc4b49542011-12-11 22:44:26 +010011977 /* no repeat, return original string */
11978 if (len == 1)
11979 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011980
Benjamin Petersonbac79492012-01-14 13:34:47 -050011981 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 return NULL;
11983
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011984 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011985 PyErr_SetString(PyExc_OverflowError,
11986 "repeated string is too long");
11987 return NULL;
11988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011990
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011991 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992 if (!u)
11993 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011994 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (PyUnicode_GET_LENGTH(str) == 1) {
11997 const int kind = PyUnicode_KIND(str);
11998 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011999 if (kind == PyUnicode_1BYTE_KIND) {
12000 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012001 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012002 }
12003 else if (kind == PyUnicode_2BYTE_KIND) {
12004 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012005 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012006 ucs2[n] = fill_char;
12007 } else {
12008 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12009 assert(kind == PyUnicode_4BYTE_KIND);
12010 for (n = 0; n < len; ++n)
12011 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 }
12014 else {
12015 /* number of characters copied this far */
12016 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012017 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 char *to = (char *) PyUnicode_DATA(u);
12019 Py_MEMCPY(to, PyUnicode_DATA(str),
12020 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 n = (done <= nchars-done) ? done : nchars-done;
12023 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012024 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026 }
12027
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012028 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012029 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030}
12031
Alexander Belopolsky40018472011-02-26 01:02:56 +000012032PyObject *
12033PyUnicode_Replace(PyObject *obj,
12034 PyObject *subobj,
12035 PyObject *replobj,
12036 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037{
12038 PyObject *self;
12039 PyObject *str1;
12040 PyObject *str2;
12041 PyObject *result;
12042
12043 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012044 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012047 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 Py_DECREF(self);
12049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 }
12051 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012052 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 Py_DECREF(self);
12054 Py_DECREF(str1);
12055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012057 if (PyUnicode_READY(self) == -1 ||
12058 PyUnicode_READY(str1) == -1 ||
12059 PyUnicode_READY(str2) == -1)
12060 result = NULL;
12061 else
12062 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 Py_DECREF(self);
12064 Py_DECREF(str1);
12065 Py_DECREF(str2);
12066 return result;
12067}
12068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012069PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012070 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071\n\
12072Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012073old replaced by new. If the optional argument count is\n\
12074given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
12076static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 PyObject *str1;
12080 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012081 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 PyObject *result;
12083
Martin v. Löwis18e16552006-02-15 17:27:45 +000012084 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012086 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012089 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 return NULL;
12091 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012092 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 Py_DECREF(str1);
12094 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012095 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012096 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12097 result = NULL;
12098 else
12099 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
12101 Py_DECREF(str1);
12102 Py_DECREF(str2);
12103 return result;
12104}
12105
Alexander Belopolsky40018472011-02-26 01:02:56 +000012106static PyObject *
12107unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012109 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 Py_ssize_t isize;
12111 Py_ssize_t osize, squote, dquote, i, o;
12112 Py_UCS4 max, quote;
12113 int ikind, okind;
12114 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012117 return NULL;
12118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 isize = PyUnicode_GET_LENGTH(unicode);
12120 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 /* Compute length of output, quote characters, and
12123 maximum character */
12124 osize = 2; /* quotes */
12125 max = 127;
12126 squote = dquote = 0;
12127 ikind = PyUnicode_KIND(unicode);
12128 for (i = 0; i < isize; i++) {
12129 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12130 switch (ch) {
12131 case '\'': squote++; osize++; break;
12132 case '"': dquote++; osize++; break;
12133 case '\\': case '\t': case '\r': case '\n':
12134 osize += 2; break;
12135 default:
12136 /* Fast-path ASCII */
12137 if (ch < ' ' || ch == 0x7f)
12138 osize += 4; /* \xHH */
12139 else if (ch < 0x7f)
12140 osize++;
12141 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12142 osize++;
12143 max = ch > max ? ch : max;
12144 }
12145 else if (ch < 0x100)
12146 osize += 4; /* \xHH */
12147 else if (ch < 0x10000)
12148 osize += 6; /* \uHHHH */
12149 else
12150 osize += 10; /* \uHHHHHHHH */
12151 }
12152 }
12153
12154 quote = '\'';
12155 if (squote) {
12156 if (dquote)
12157 /* Both squote and dquote present. Use squote,
12158 and escape them */
12159 osize += squote;
12160 else
12161 quote = '"';
12162 }
12163
12164 repr = PyUnicode_New(osize, max);
12165 if (repr == NULL)
12166 return NULL;
12167 okind = PyUnicode_KIND(repr);
12168 odata = PyUnicode_DATA(repr);
12169
12170 PyUnicode_WRITE(okind, odata, 0, quote);
12171 PyUnicode_WRITE(okind, odata, osize-1, quote);
12172
12173 for (i = 0, o = 1; i < isize; i++) {
12174 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012175
12176 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if ((ch == quote) || (ch == '\\')) {
12178 PyUnicode_WRITE(okind, odata, o++, '\\');
12179 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012180 continue;
12181 }
12182
Benjamin Peterson29060642009-01-31 22:14:21 +000012183 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012184 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 PyUnicode_WRITE(okind, odata, o++, '\\');
12186 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012187 }
12188 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 PyUnicode_WRITE(okind, odata, o++, '\\');
12190 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012191 }
12192 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 PyUnicode_WRITE(okind, odata, o++, '\\');
12194 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012195 }
12196
12197 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012198 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 PyUnicode_WRITE(okind, odata, o++, '\\');
12200 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012201 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12202 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012203 }
12204
Georg Brandl559e5d72008-06-11 18:37:52 +000012205 /* Copy ASCII characters as-is */
12206 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012208 }
12209
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012211 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012212 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012213 (categories Z* and C* except ASCII space)
12214 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012216 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 if (ch <= 0xff) {
12218 PyUnicode_WRITE(okind, odata, o++, '\\');
12219 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012222 }
12223 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 else if (ch >= 0x10000) {
12225 PyUnicode_WRITE(okind, odata, o++, '\\');
12226 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012227 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12229 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12232 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12234 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012235 }
12236 /* Map 16-bit characters to '\uxxxx' */
12237 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 PyUnicode_WRITE(okind, odata, o++, '\\');
12239 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12242 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12243 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012244 }
12245 }
12246 /* Copy characters as-is */
12247 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012249 }
12250 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012253 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012254 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255}
12256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012257PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259\n\
12260Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012261such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262arguments start and end are interpreted as in slice notation.\n\
12263\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012264Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
12266static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012269 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012270 Py_ssize_t start;
12271 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273
Jesus Ceaac451502011-04-20 17:09:23 +020012274 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12275 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 if (PyUnicode_READY(self) == -1)
12279 return NULL;
12280 if (PyUnicode_READY(substring) == -1)
12281 return NULL;
12282
Victor Stinner7931d9a2011-11-04 00:22:48 +010012283 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284
12285 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 if (result == -2)
12288 return NULL;
12289
Christian Heimes217cfd12007-12-02 14:31:20 +000012290 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291}
12292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012293PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012296Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
12298static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012301 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012302 Py_ssize_t start;
12303 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
Jesus Ceaac451502011-04-20 17:09:23 +020012306 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12307 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (PyUnicode_READY(self) == -1)
12311 return NULL;
12312 if (PyUnicode_READY(substring) == -1)
12313 return NULL;
12314
Victor Stinner7931d9a2011-11-04 00:22:48 +010012315 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316
12317 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 if (result == -2)
12320 return NULL;
12321
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322 if (result < 0) {
12323 PyErr_SetString(PyExc_ValueError, "substring not found");
12324 return NULL;
12325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326
Christian Heimes217cfd12007-12-02 14:31:20 +000012327 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328}
12329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012330PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012333Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012334done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
12336static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012337unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012339 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 Py_UCS4 fillchar = ' ';
12341
Victor Stinnere9a29352011-10-01 02:14:59 +020012342 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012344
Benjamin Petersonbac79492012-01-14 13:34:47 -050012345 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346 return NULL;
12347
Victor Stinnerc4b49542011-12-11 22:44:26 +010012348 if (PyUnicode_GET_LENGTH(self) >= width)
12349 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350
Victor Stinnerc4b49542011-12-11 22:44:26 +010012351 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352}
12353
Alexander Belopolsky40018472011-02-26 01:02:56 +000012354PyObject *
12355PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
12357 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012358
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 s = PyUnicode_FromObject(s);
12360 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012361 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 if (sep != NULL) {
12363 sep = PyUnicode_FromObject(sep);
12364 if (sep == NULL) {
12365 Py_DECREF(s);
12366 return NULL;
12367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 }
12369
Victor Stinner9310abb2011-10-05 00:59:23 +020012370 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371
12372 Py_DECREF(s);
12373 Py_XDECREF(sep);
12374 return result;
12375}
12376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012377PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379\n\
12380Return a list of the words in S, using sep as the\n\
12381delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012382splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012383whitespace string is a separator and empty strings are\n\
12384removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385
12386static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012387unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388{
12389 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012390 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391
Martin v. Löwis18e16552006-02-15 17:27:45 +000012392 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393 return NULL;
12394
12395 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012398 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012400 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401}
12402
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403PyObject *
12404PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12405{
12406 PyObject* str_obj;
12407 PyObject* sep_obj;
12408 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 int kind1, kind2, kind;
12410 void *buf1 = NULL, *buf2 = NULL;
12411 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012412
12413 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012414 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012417 if (!sep_obj) {
12418 Py_DECREF(str_obj);
12419 return NULL;
12420 }
12421 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12422 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012423 Py_DECREF(str_obj);
12424 return NULL;
12425 }
12426
Victor Stinner14f8f022011-10-05 20:58:25 +020012427 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012429 kind = Py_MAX(kind1, kind2);
12430 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012432 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 if (!buf1)
12434 goto onError;
12435 buf2 = PyUnicode_DATA(sep_obj);
12436 if (kind2 != kind)
12437 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12438 if (!buf2)
12439 goto onError;
12440 len1 = PyUnicode_GET_LENGTH(str_obj);
12441 len2 = PyUnicode_GET_LENGTH(sep_obj);
12442
Benjamin Petersonead6b532011-12-20 17:23:42 -060012443 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012445 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12446 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12447 else
12448 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 break;
12450 case PyUnicode_2BYTE_KIND:
12451 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12452 break;
12453 case PyUnicode_4BYTE_KIND:
12454 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12455 break;
12456 default:
12457 assert(0);
12458 out = 0;
12459 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460
12461 Py_DECREF(sep_obj);
12462 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 if (kind1 != kind)
12464 PyMem_Free(buf1);
12465 if (kind2 != kind)
12466 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012467
12468 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 onError:
12470 Py_DECREF(sep_obj);
12471 Py_DECREF(str_obj);
12472 if (kind1 != kind && buf1)
12473 PyMem_Free(buf1);
12474 if (kind2 != kind && buf2)
12475 PyMem_Free(buf2);
12476 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012477}
12478
12479
12480PyObject *
12481PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12482{
12483 PyObject* str_obj;
12484 PyObject* sep_obj;
12485 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 int kind1, kind2, kind;
12487 void *buf1 = NULL, *buf2 = NULL;
12488 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489
12490 str_obj = PyUnicode_FromObject(str_in);
12491 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012493 sep_obj = PyUnicode_FromObject(sep_in);
12494 if (!sep_obj) {
12495 Py_DECREF(str_obj);
12496 return NULL;
12497 }
12498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 kind1 = PyUnicode_KIND(str_in);
12500 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012501 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 buf1 = PyUnicode_DATA(str_in);
12503 if (kind1 != kind)
12504 buf1 = _PyUnicode_AsKind(str_in, kind);
12505 if (!buf1)
12506 goto onError;
12507 buf2 = PyUnicode_DATA(sep_obj);
12508 if (kind2 != kind)
12509 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12510 if (!buf2)
12511 goto onError;
12512 len1 = PyUnicode_GET_LENGTH(str_obj);
12513 len2 = PyUnicode_GET_LENGTH(sep_obj);
12514
Benjamin Petersonead6b532011-12-20 17:23:42 -060012515 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012517 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12518 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12519 else
12520 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 break;
12522 case PyUnicode_2BYTE_KIND:
12523 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12524 break;
12525 case PyUnicode_4BYTE_KIND:
12526 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12527 break;
12528 default:
12529 assert(0);
12530 out = 0;
12531 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012532
12533 Py_DECREF(sep_obj);
12534 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 if (kind1 != kind)
12536 PyMem_Free(buf1);
12537 if (kind2 != kind)
12538 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012539
12540 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 onError:
12542 Py_DECREF(sep_obj);
12543 Py_DECREF(str_obj);
12544 if (kind1 != kind && buf1)
12545 PyMem_Free(buf1);
12546 if (kind2 != kind && buf2)
12547 PyMem_Free(buf2);
12548 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012549}
12550
12551PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012552 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012553\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012554Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012555the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012556found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012557
12558static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012559unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012560{
Victor Stinner9310abb2011-10-05 00:59:23 +020012561 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012562}
12563
12564PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012565 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012566\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012567Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012568the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012569separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012570
12571static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012572unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012573{
Victor Stinner9310abb2011-10-05 00:59:23 +020012574 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012575}
12576
Alexander Belopolsky40018472011-02-26 01:02:56 +000012577PyObject *
12578PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012579{
12580 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012582 s = PyUnicode_FromObject(s);
12583 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012584 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 if (sep != NULL) {
12586 sep = PyUnicode_FromObject(sep);
12587 if (sep == NULL) {
12588 Py_DECREF(s);
12589 return NULL;
12590 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012591 }
12592
Victor Stinner9310abb2011-10-05 00:59:23 +020012593 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012594
12595 Py_DECREF(s);
12596 Py_XDECREF(sep);
12597 return result;
12598}
12599
12600PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012602\n\
12603Return a list of the words in S, using sep as the\n\
12604delimiter string, starting at the end of the string and\n\
12605working to the front. If maxsplit is given, at most maxsplit\n\
12606splits are done. If sep is not specified, any whitespace string\n\
12607is a separator.");
12608
12609static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012610unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012611{
12612 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012613 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012614
Martin v. Löwis18e16552006-02-15 17:27:45 +000012615 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012616 return NULL;
12617
12618 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012620 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012621 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012622 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012623 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012624}
12625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012626PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628\n\
12629Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012630Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012631is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632
12633static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012634unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012636 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012637 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012639 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12640 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641 return NULL;
12642
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012643 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644}
12645
12646static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012647PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012649 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650}
12651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012652PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654\n\
12655Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012656and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657
12658static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012659unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012661 if (PyUnicode_READY(self) == -1)
12662 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012663 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664}
12665
Georg Brandlceee0772007-11-27 23:48:05 +000012666PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012668\n\
12669Return a translation table usable for str.translate().\n\
12670If there is only one argument, it must be a dictionary mapping Unicode\n\
12671ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012672Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012673If there are two arguments, they must be strings of equal length, and\n\
12674in the resulting dictionary, each character in x will be mapped to the\n\
12675character at the same position in y. If there is a third argument, it\n\
12676must be a string, whose characters will be mapped to None in the result.");
12677
12678static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012679unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012680{
12681 PyObject *x, *y = NULL, *z = NULL;
12682 PyObject *new = NULL, *key, *value;
12683 Py_ssize_t i = 0;
12684 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685
Georg Brandlceee0772007-11-27 23:48:05 +000012686 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12687 return NULL;
12688 new = PyDict_New();
12689 if (!new)
12690 return NULL;
12691 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 int x_kind, y_kind, z_kind;
12693 void *x_data, *y_data, *z_data;
12694
Georg Brandlceee0772007-11-27 23:48:05 +000012695 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012696 if (!PyUnicode_Check(x)) {
12697 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12698 "be a string if there is a second argument");
12699 goto err;
12700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012702 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12703 "arguments must have equal length");
12704 goto err;
12705 }
12706 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 x_kind = PyUnicode_KIND(x);
12708 y_kind = PyUnicode_KIND(y);
12709 x_data = PyUnicode_DATA(x);
12710 y_data = PyUnicode_DATA(y);
12711 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12712 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012713 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012714 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012715 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012716 if (!value) {
12717 Py_DECREF(key);
12718 goto err;
12719 }
Georg Brandlceee0772007-11-27 23:48:05 +000012720 res = PyDict_SetItem(new, key, value);
12721 Py_DECREF(key);
12722 Py_DECREF(value);
12723 if (res < 0)
12724 goto err;
12725 }
12726 /* create entries for deleting chars in z */
12727 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 z_kind = PyUnicode_KIND(z);
12729 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012730 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012732 if (!key)
12733 goto err;
12734 res = PyDict_SetItem(new, key, Py_None);
12735 Py_DECREF(key);
12736 if (res < 0)
12737 goto err;
12738 }
12739 }
12740 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 int kind;
12742 void *data;
12743
Georg Brandlceee0772007-11-27 23:48:05 +000012744 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012745 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012746 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12747 "to maketrans it must be a dict");
12748 goto err;
12749 }
12750 /* copy entries into the new dict, converting string keys to int keys */
12751 while (PyDict_Next(x, &i, &key, &value)) {
12752 if (PyUnicode_Check(key)) {
12753 /* convert string keys to integer keys */
12754 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012755 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012756 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12757 "table must be of length 1");
12758 goto err;
12759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 kind = PyUnicode_KIND(key);
12761 data = PyUnicode_DATA(key);
12762 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012763 if (!newkey)
12764 goto err;
12765 res = PyDict_SetItem(new, newkey, value);
12766 Py_DECREF(newkey);
12767 if (res < 0)
12768 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012769 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012770 /* just keep integer keys */
12771 if (PyDict_SetItem(new, key, value) < 0)
12772 goto err;
12773 } else {
12774 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12775 "be strings or integers");
12776 goto err;
12777 }
12778 }
12779 }
12780 return new;
12781 err:
12782 Py_DECREF(new);
12783 return NULL;
12784}
12785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012786PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788\n\
12789Return a copy of the string S, where all characters have been mapped\n\
12790through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012791Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012792Unmapped characters are left untouched. Characters mapped to None\n\
12793are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
12795static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012801PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012804Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
12806static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012807unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012809 if (PyUnicode_READY(self) == -1)
12810 return NULL;
12811 if (PyUnicode_IS_ASCII(self))
12812 return ascii_upper_or_lower(self, 0);
12813 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814}
12815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012816PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012819Pad a numeric string S with zeros on the left, to fill a field\n\
12820of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821
12822static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012823unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012825 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012826 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012827 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 int kind;
12829 void *data;
12830 Py_UCS4 chr;
12831
Martin v. Löwis18e16552006-02-15 17:27:45 +000012832 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833 return NULL;
12834
Benjamin Petersonbac79492012-01-14 13:34:47 -050012835 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
Victor Stinnerc4b49542011-12-11 22:44:26 +010012838 if (PyUnicode_GET_LENGTH(self) >= width)
12839 return unicode_result_unchanged(self);
12840
12841 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
12843 u = pad(self, fill, 0, '0');
12844
Walter Dörwald068325e2002-04-15 13:36:47 +000012845 if (u == NULL)
12846 return NULL;
12847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 kind = PyUnicode_KIND(u);
12849 data = PyUnicode_DATA(u);
12850 chr = PyUnicode_READ(kind, data, fill);
12851
12852 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 PyUnicode_WRITE(kind, data, 0, chr);
12855 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856 }
12857
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012858 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012859 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861
#if 0
/* Compiled out by the surrounding "#if 0".  Thin wrapper that returns the
   result of PyUnicode_TransformDecimalAndSpaceToASCII(self); the method
   table lists it (also under "#if 0") as a debugging-only method. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012870PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012873Return True if S starts with the specified prefix, False otherwise.\n\
12874With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012875With optional end, stop comparing S at that position.\n\
12876prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
12878static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012879unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012882 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012883 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012884 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012885 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012886 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
Jesus Ceaac451502011-04-20 17:09:23 +020012888 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012889 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012890 if (PyTuple_Check(subobj)) {
12891 Py_ssize_t i;
12892 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012893 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012894 if (substring == NULL)
12895 return NULL;
12896 result = tailmatch(self, substring, start, end, -1);
12897 Py_DECREF(substring);
12898 if (result) {
12899 Py_RETURN_TRUE;
12900 }
12901 }
12902 /* nothing matched */
12903 Py_RETURN_FALSE;
12904 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012905 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012906 if (substring == NULL) {
12907 if (PyErr_ExceptionMatches(PyExc_TypeError))
12908 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12909 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012910 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012911 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012912 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012914 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915}
12916
12917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012918PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012921Return True if S ends with the specified suffix, False otherwise.\n\
12922With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012923With optional end, stop comparing S at that position.\n\
12924suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925
12926static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012927unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012930 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012931 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012932 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012933 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012934 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935
Jesus Ceaac451502011-04-20 17:09:23 +020012936 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012938 if (PyTuple_Check(subobj)) {
12939 Py_ssize_t i;
12940 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012941 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012943 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012945 result = tailmatch(self, substring, start, end, +1);
12946 Py_DECREF(substring);
12947 if (result) {
12948 Py_RETURN_TRUE;
12949 }
12950 }
12951 Py_RETURN_FALSE;
12952 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012953 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012954 if (substring == NULL) {
12955 if (PyErr_ExceptionMatches(PyExc_TypeError))
12956 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12957 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012958 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012959 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012960 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012962 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012966
12967PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012969\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012970Return a formatted version of S, using substitutions from args and kwargs.\n\
12971The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012972
Eric Smith27bbca62010-11-04 17:06:58 +000012973PyDoc_STRVAR(format_map__doc__,
12974 "S.format_map(mapping) -> str\n\
12975\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012976Return a formatted version of S, using substitutions from mapping.\n\
12977The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012978
Eric Smith4a7d76d2008-05-30 18:10:19 +000012979static PyObject *
12980unicode__format__(PyObject* self, PyObject* args)
12981{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012982 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012983
12984 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12985 return NULL;
12986
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012987 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012989 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012990}
12991
Eric Smith8c663262007-08-25 02:26:07 +000012992PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012994\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012995Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012996
12997static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012998unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 Py_ssize_t size;
13001
13002 /* If it's a compact object, account for base structure +
13003 character data. */
13004 if (PyUnicode_IS_COMPACT_ASCII(v))
13005 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13006 else if (PyUnicode_IS_COMPACT(v))
13007 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013008 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 else {
13010 /* If it is a two-block object, account for base object, and
13011 for character block if present. */
13012 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013013 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013015 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 }
13017 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013018 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013019 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013021 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013022 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023
13024 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013025}
13026
13027PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013028 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013029
13030static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013031unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013032{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013033 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 if (!copy)
13035 return NULL;
13036 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013037}
13038
/* Method table: one entry per Python-level method name, giving the C
   implementation, its calling-convention flags (METH_*), and, for most
   entries, a docstring.  The list ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially.  */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* no docstring is supplied for __getnewargs__ */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* sentinel */
    {NULL, NULL}
};
13099
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013100static PyObject *
13101unicode_mod(PyObject *v, PyObject *w)
13102{
Brian Curtindfc80e32011-08-10 20:28:54 -050013103 if (!PyUnicode_Check(v))
13104 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013106}
13107
/* Number protocol table.  Only nb_remainder is filled in (with
   unicode_mod); the three slots before it are explicitly 0 and every slot
   after it is left to C's default zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13114
/* Sequence protocol table: length, concatenation, repetition, item access
   and containment are provided; the slice and assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13125
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013126static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013127unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 if (PyUnicode_READY(self) == -1)
13130 return NULL;
13131
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013132 if (PyIndex_Check(item)) {
13133 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013134 if (i == -1 && PyErr_Occurred())
13135 return NULL;
13136 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013138 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013139 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013140 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013141 PyObject *result;
13142 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013143 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013144 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013146 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013148 return NULL;
13149 }
13150
13151 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013152 Py_INCREF(unicode_empty);
13153 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013155 slicelength == PyUnicode_GET_LENGTH(self)) {
13156 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013157 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013158 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013159 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013160 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013161 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013162 src_kind = PyUnicode_KIND(self);
13163 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013164 if (!PyUnicode_IS_ASCII(self)) {
13165 kind_limit = kind_maxchar_limit(src_kind);
13166 max_char = 0;
13167 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13168 ch = PyUnicode_READ(src_kind, src_data, cur);
13169 if (ch > max_char) {
13170 max_char = ch;
13171 if (max_char >= kind_limit)
13172 break;
13173 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013174 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013175 }
Victor Stinner55c99112011-10-13 01:17:06 +020013176 else
13177 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013178 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013179 if (result == NULL)
13180 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013181 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013182 dest_data = PyUnicode_DATA(result);
13183
13184 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013185 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13186 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013187 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013188 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013189 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013190 } else {
13191 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13192 return NULL;
13193 }
13194}
13195
13196static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013197 (lenfunc)unicode_length, /* mp_length */
13198 (binaryfunc)unicode_subscript, /* mp_subscript */
13199 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013200};
13201
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203/* Helpers for PyUnicode_Format() */
13204
13205static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013206getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013208 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 (*p_argidx)++;
13211 if (arglen < 0)
13212 return args;
13213 else
13214 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215 }
13216 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013217 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218 return NULL;
13219}
13220
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013221/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013223static PyObject *
13224formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013226 char *p;
13227 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013229
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230 x = PyFloat_AsDouble(v);
13231 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013232 return NULL;
13233
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013236
Eric Smith0923d1d2009-04-16 20:16:10 +000013237 p = PyOS_double_to_string(x, type, prec,
13238 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013239 if (p == NULL)
13240 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013242 PyMem_Free(p);
13243 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244}
13245
Tim Peters38fd5b62000-09-21 05:43:11 +000013246static PyObject*
13247formatlong(PyObject *val, int flags, int prec, int type)
13248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013249 char *buf;
13250 int len;
13251 PyObject *str; /* temporary string object. */
13252 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013253
Benjamin Peterson14339b62009-01-31 16:36:08 +000013254 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13255 if (!str)
13256 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 Py_DECREF(str);
13259 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013260}
13261
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013262static Py_UCS4
13263formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013265 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013266 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013268 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013270 goto onError;
13271 }
13272 else {
13273 /* Integer input truncated to a character */
13274 long x;
13275 x = PyLong_AsLong(v);
13276 if (x == -1 && PyErr_Occurred())
13277 goto onError;
13278
Victor Stinner8faf8212011-12-08 22:14:11 +010013279 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 PyErr_SetString(PyExc_OverflowError,
13281 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013282 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 }
13284
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013285 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013287
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013289 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013290 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013291 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292}
13293
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013294static int
13295repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13296{
13297 int r;
13298 assert(count > 0);
13299 assert(PyUnicode_Check(obj));
13300 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013301 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013302 if (repeated == NULL)
13303 return -1;
13304 r = _PyAccu_Accumulate(acc, repeated);
13305 Py_DECREF(repeated);
13306 return r;
13307 }
13308 else {
13309 do {
13310 if (_PyAccu_Accumulate(acc, obj))
13311 return -1;
13312 } while (--count);
13313 return 0;
13314 }
13315}
13316
Alexander Belopolsky40018472011-02-26 01:02:56 +000013317PyObject *
13318PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 void *fmt;
13321 int fmtkind;
13322 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013324 int r;
13325 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013328 PyObject *temp = NULL;
13329 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013330 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013331 _PyAccu acc;
13332 static PyObject *plus, *minus, *blank, *zero, *percent;
13333
13334 if (!plus && !(plus = get_latin1_char('+')))
13335 return NULL;
13336 if (!minus && !(minus = get_latin1_char('-')))
13337 return NULL;
13338 if (!blank && !(blank = get_latin1_char(' ')))
13339 return NULL;
13340 if (!zero && !(zero = get_latin1_char('0')))
13341 return NULL;
13342 if (!percent && !(percent = get_latin1_char('%')))
13343 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013344
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 PyErr_BadInternalCall();
13347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013349 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013350 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013352 if (PyUnicode_READY(uformat) == -1)
13353 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013354 if (_PyAccu_Init(&acc))
13355 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013356 fmt = PyUnicode_DATA(uformat);
13357 fmtkind = PyUnicode_KIND(uformat);
13358 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13359 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 arglen = PyTuple_Size(args);
13363 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364 }
13365 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 arglen = -1;
13367 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013369 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013370 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
13373 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013375 PyObject *nonfmt;
13376 Py_ssize_t nonfmtpos;
13377 nonfmtpos = fmtpos++;
13378 while (fmtcnt >= 0 &&
13379 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13380 fmtpos++;
13381 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013383 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013384 if (nonfmt == NULL)
13385 goto onError;
13386 r = _PyAccu_Accumulate(&acc, nonfmt);
13387 Py_DECREF(nonfmt);
13388 if (r)
13389 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013390 }
13391 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 /* Got a format specifier */
13393 int flags = 0;
13394 Py_ssize_t width = -1;
13395 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 int isnumok;
13399 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013400 void *pbuf = NULL;
13401 Py_ssize_t pindex, len;
13402 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013404 fmtpos++;
13405 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13406 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 Py_ssize_t keylen;
13408 PyObject *key;
13409 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013410
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 if (dict == NULL) {
13412 PyErr_SetString(PyExc_TypeError,
13413 "format requires a mapping");
13414 goto onError;
13415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 /* Skip over balanced parentheses */
13420 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013423 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013425 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 if (fmtcnt < 0 || pcount > 0) {
13429 PyErr_SetString(PyExc_ValueError,
13430 "incomplete format key");
13431 goto onError;
13432 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013433 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013434 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 if (key == NULL)
13436 goto onError;
13437 if (args_owned) {
13438 Py_DECREF(args);
13439 args_owned = 0;
13440 }
13441 args = PyObject_GetItem(dict, key);
13442 Py_DECREF(key);
13443 if (args == NULL) {
13444 goto onError;
13445 }
13446 args_owned = 1;
13447 arglen = -1;
13448 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013449 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013451 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 case '-': flags |= F_LJUST; continue;
13453 case '+': flags |= F_SIGN; continue;
13454 case ' ': flags |= F_BLANK; continue;
13455 case '#': flags |= F_ALT; continue;
13456 case '0': flags |= F_ZERO; continue;
13457 }
13458 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013459 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 if (c == '*') {
13461 v = getnextarg(args, arglen, &argidx);
13462 if (v == NULL)
13463 goto onError;
13464 if (!PyLong_Check(v)) {
13465 PyErr_SetString(PyExc_TypeError,
13466 "* wants int");
13467 goto onError;
13468 }
13469 width = PyLong_AsLong(v);
13470 if (width == -1 && PyErr_Occurred())
13471 goto onError;
13472 if (width < 0) {
13473 flags |= F_LJUST;
13474 width = -width;
13475 }
13476 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 }
13479 else if (c >= '0' && c <= '9') {
13480 width = c - '0';
13481 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013482 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 if (c < '0' || c > '9')
13484 break;
13485 if ((width*10) / 10 != width) {
13486 PyErr_SetString(PyExc_ValueError,
13487 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013488 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 }
13490 width = width*10 + (c - '0');
13491 }
13492 }
13493 if (c == '.') {
13494 prec = 0;
13495 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 if (c == '*') {
13498 v = getnextarg(args, arglen, &argidx);
13499 if (v == NULL)
13500 goto onError;
13501 if (!PyLong_Check(v)) {
13502 PyErr_SetString(PyExc_TypeError,
13503 "* wants int");
13504 goto onError;
13505 }
13506 prec = PyLong_AsLong(v);
13507 if (prec == -1 && PyErr_Occurred())
13508 goto onError;
13509 if (prec < 0)
13510 prec = 0;
13511 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013512 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 }
13514 else if (c >= '0' && c <= '9') {
13515 prec = c - '0';
13516 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013517 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013518 if (c < '0' || c > '9')
13519 break;
13520 if ((prec*10) / 10 != prec) {
13521 PyErr_SetString(PyExc_ValueError,
13522 "prec too big");
13523 goto onError;
13524 }
13525 prec = prec*10 + (c - '0');
13526 }
13527 }
13528 } /* prec */
13529 if (fmtcnt >= 0) {
13530 if (c == 'h' || c == 'l' || c == 'L') {
13531 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013532 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 }
13534 }
13535 if (fmtcnt < 0) {
13536 PyErr_SetString(PyExc_ValueError,
13537 "incomplete format");
13538 goto onError;
13539 }
13540 if (c != '%') {
13541 v = getnextarg(args, arglen, &argidx);
13542 if (v == NULL)
13543 goto onError;
13544 }
13545 sign = 0;
13546 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013547 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 switch (c) {
13549
13550 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 _PyAccu_Accumulate(&acc, percent);
13552 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013553
13554 case 's':
13555 case 'r':
13556 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013557 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 temp = v;
13559 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013560 }
13561 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 if (c == 's')
13563 temp = PyObject_Str(v);
13564 else if (c == 'r')
13565 temp = PyObject_Repr(v);
13566 else
13567 temp = PyObject_ASCII(v);
13568 if (temp == NULL)
13569 goto onError;
13570 if (PyUnicode_Check(temp))
13571 /* nothing to do */;
13572 else {
13573 Py_DECREF(temp);
13574 PyErr_SetString(PyExc_TypeError,
13575 "%s argument has non-string str()");
13576 goto onError;
13577 }
13578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013579 if (PyUnicode_READY(temp) == -1) {
13580 Py_CLEAR(temp);
13581 goto onError;
13582 }
13583 pbuf = PyUnicode_DATA(temp);
13584 kind = PyUnicode_KIND(temp);
13585 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 if (prec >= 0 && len > prec)
13587 len = prec;
13588 break;
13589
13590 case 'i':
13591 case 'd':
13592 case 'u':
13593 case 'o':
13594 case 'x':
13595 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 isnumok = 0;
13597 if (PyNumber_Check(v)) {
13598 PyObject *iobj=NULL;
13599
13600 if (PyLong_Check(v)) {
13601 iobj = v;
13602 Py_INCREF(iobj);
13603 }
13604 else {
13605 iobj = PyNumber_Long(v);
13606 }
13607 if (iobj!=NULL) {
13608 if (PyLong_Check(iobj)) {
13609 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013610 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 Py_DECREF(iobj);
13612 if (!temp)
13613 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 if (PyUnicode_READY(temp) == -1) {
13615 Py_CLEAR(temp);
13616 goto onError;
13617 }
13618 pbuf = PyUnicode_DATA(temp);
13619 kind = PyUnicode_KIND(temp);
13620 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013621 sign = 1;
13622 }
13623 else {
13624 Py_DECREF(iobj);
13625 }
13626 }
13627 }
13628 if (!isnumok) {
13629 PyErr_Format(PyExc_TypeError,
13630 "%%%c format: a number is required, "
13631 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13632 goto onError;
13633 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013634 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013636 fillobj = zero;
13637 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 break;
13639
13640 case 'e':
13641 case 'E':
13642 case 'f':
13643 case 'F':
13644 case 'g':
13645 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013646 temp = formatfloat(v, flags, prec, c);
13647 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013648 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013649 if (PyUnicode_READY(temp) == -1) {
13650 Py_CLEAR(temp);
13651 goto onError;
13652 }
13653 pbuf = PyUnicode_DATA(temp);
13654 kind = PyUnicode_KIND(temp);
13655 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013656 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013657 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013658 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013659 fillobj = zero;
13660 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013661 break;
13662
13663 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013664 {
13665 Py_UCS4 ch = formatchar(v);
13666 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013668 temp = _PyUnicode_FromUCS4(&ch, 1);
13669 if (temp == NULL)
13670 goto onError;
13671 pbuf = PyUnicode_DATA(temp);
13672 kind = PyUnicode_KIND(temp);
13673 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013675 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013676
13677 default:
13678 PyErr_Format(PyExc_ValueError,
13679 "unsupported format character '%c' (0x%x) "
13680 "at index %zd",
13681 (31<=c && c<=126) ? (char)c : '?',
13682 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013683 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 goto onError;
13685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 /* pbuf is initialized here. */
13687 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013689 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13690 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013691 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013692 pindex++;
13693 }
13694 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13695 signobj = plus;
13696 len--;
13697 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 }
13699 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013700 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013702 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 else
13704 sign = 0;
13705 }
13706 if (width < len)
13707 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013709 if (fill != ' ') {
13710 assert(signobj != NULL);
13711 if (_PyAccu_Accumulate(&acc, signobj))
13712 goto onError;
13713 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 if (width > len)
13715 width--;
13716 }
13717 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013718 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013719 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013721 second = get_latin1_char(
13722 PyUnicode_READ(kind, pbuf, pindex + 1));
13723 pindex += 2;
13724 if (second == NULL ||
13725 _PyAccu_Accumulate(&acc, zero) ||
13726 _PyAccu_Accumulate(&acc, second))
13727 goto onError;
13728 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 width -= 2;
13731 if (width < 0)
13732 width = 0;
13733 len -= 2;
13734 }
13735 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013736 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013737 if (repeat_accumulate(&acc, fillobj, width - len))
13738 goto onError;
13739 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 }
13741 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013742 if (sign) {
13743 assert(signobj != NULL);
13744 if (_PyAccu_Accumulate(&acc, signobj))
13745 goto onError;
13746 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013747 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13749 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013750 second = get_latin1_char(
13751 PyUnicode_READ(kind, pbuf, pindex + 1));
13752 pindex += 2;
13753 if (second == NULL ||
13754 _PyAccu_Accumulate(&acc, zero) ||
13755 _PyAccu_Accumulate(&acc, second))
13756 goto onError;
13757 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013758 }
13759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013760 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013761 if (temp != NULL) {
13762 assert(pbuf == PyUnicode_DATA(temp));
13763 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013764 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013765 else {
13766 const char *p = (const char *) pbuf;
13767 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013768 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013769 v = PyUnicode_FromKindAndData(kind, p, len);
13770 }
13771 if (v == NULL)
13772 goto onError;
13773 r = _PyAccu_Accumulate(&acc, v);
13774 Py_DECREF(v);
13775 if (r)
13776 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013777 if (width > len && repeat_accumulate(&acc, blank, width - len))
13778 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 if (dict && (argidx < arglen) && c != '%') {
13780 PyErr_SetString(PyExc_TypeError,
13781 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 goto onError;
13783 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013784 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013785 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786 } /* until end */
13787 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013788 PyErr_SetString(PyExc_TypeError,
13789 "not all arguments converted during string formatting");
13790 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791 }
13792
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013793 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013794 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796 }
13797 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013798 Py_XDECREF(temp);
13799 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013800 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013801
Benjamin Peterson29060642009-01-31 22:14:21 +000013802 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013804 Py_XDECREF(temp);
13805 Py_XDECREF(second);
13806 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809 }
13810 return NULL;
13811}
13812
Jeremy Hylton938ace62002-07-17 16:30:39 +000013813static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013814unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13815
Tim Peters6d6c1a32001-08-02 04:15:00 +000013816static PyObject *
13817unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13818{
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013820 static char *kwlist[] = {"object", "encoding", "errors", 0};
13821 char *encoding = NULL;
13822 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013823
Benjamin Peterson14339b62009-01-31 16:36:08 +000013824 if (type != &PyUnicode_Type)
13825 return unicode_subtype_new(type, args, kwds);
13826 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013828 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013829 if (x == NULL) {
13830 Py_INCREF(unicode_empty);
13831 return unicode_empty;
13832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013833 if (encoding == NULL && errors == NULL)
13834 return PyObject_Str(x);
13835 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013836 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013837}
13838
Guido van Rossume023fe02001-08-30 03:12:59 +000013839static PyObject *
13840unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13841{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013842 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013843 Py_ssize_t length, char_size;
13844 int share_wstr, share_utf8;
13845 unsigned int kind;
13846 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013847
Benjamin Peterson14339b62009-01-31 16:36:08 +000013848 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013849
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013850 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013851 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013852 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013853 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013854 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013855 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013856 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013857 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013858
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013859 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013860 if (self == NULL) {
13861 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013862 return NULL;
13863 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013864 kind = PyUnicode_KIND(unicode);
13865 length = PyUnicode_GET_LENGTH(unicode);
13866
13867 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013868#ifdef Py_DEBUG
13869 _PyUnicode_HASH(self) = -1;
13870#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013871 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013872#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013873 _PyUnicode_STATE(self).interned = 0;
13874 _PyUnicode_STATE(self).kind = kind;
13875 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013876 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013877 _PyUnicode_STATE(self).ready = 1;
13878 _PyUnicode_WSTR(self) = NULL;
13879 _PyUnicode_UTF8_LENGTH(self) = 0;
13880 _PyUnicode_UTF8(self) = NULL;
13881 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013882 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013883
13884 share_utf8 = 0;
13885 share_wstr = 0;
13886 if (kind == PyUnicode_1BYTE_KIND) {
13887 char_size = 1;
13888 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13889 share_utf8 = 1;
13890 }
13891 else if (kind == PyUnicode_2BYTE_KIND) {
13892 char_size = 2;
13893 if (sizeof(wchar_t) == 2)
13894 share_wstr = 1;
13895 }
13896 else {
13897 assert(kind == PyUnicode_4BYTE_KIND);
13898 char_size = 4;
13899 if (sizeof(wchar_t) == 4)
13900 share_wstr = 1;
13901 }
13902
13903 /* Ensure we won't overflow the length. */
13904 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13905 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013906 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013907 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013908 data = PyObject_MALLOC((length + 1) * char_size);
13909 if (data == NULL) {
13910 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013911 goto onError;
13912 }
13913
Victor Stinnerc3c74152011-10-02 20:39:55 +020013914 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013915 if (share_utf8) {
13916 _PyUnicode_UTF8_LENGTH(self) = length;
13917 _PyUnicode_UTF8(self) = data;
13918 }
13919 if (share_wstr) {
13920 _PyUnicode_WSTR_LENGTH(self) = length;
13921 _PyUnicode_WSTR(self) = (wchar_t *)data;
13922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013923
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013924 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013925 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013926 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013927#ifdef Py_DEBUG
13928 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13929#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013930 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013931 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013932
13933onError:
13934 Py_DECREF(unicode);
13935 Py_DECREF(self);
13936 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013937}
13938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013939PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013941\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013942Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013943encoding defaults to the current default string encoding.\n\
13944errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013945
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013946static PyObject *unicode_iter(PyObject *seq);
13947
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013949 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 "str", /* tp_name */
13951 sizeof(PyUnicodeObject), /* tp_size */
13952 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 (destructor)unicode_dealloc, /* tp_dealloc */
13955 0, /* tp_print */
13956 0, /* tp_getattr */
13957 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013958 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 unicode_repr, /* tp_repr */
13960 &unicode_as_number, /* tp_as_number */
13961 &unicode_as_sequence, /* tp_as_sequence */
13962 &unicode_as_mapping, /* tp_as_mapping */
13963 (hashfunc) unicode_hash, /* tp_hash*/
13964 0, /* tp_call*/
13965 (reprfunc) unicode_str, /* tp_str */
13966 PyObject_GenericGetAttr, /* tp_getattro */
13967 0, /* tp_setattro */
13968 0, /* tp_as_buffer */
13969 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013970 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 unicode_doc, /* tp_doc */
13972 0, /* tp_traverse */
13973 0, /* tp_clear */
13974 PyUnicode_RichCompare, /* tp_richcompare */
13975 0, /* tp_weaklistoffset */
13976 unicode_iter, /* tp_iter */
13977 0, /* tp_iternext */
13978 unicode_methods, /* tp_methods */
13979 0, /* tp_members */
13980 0, /* tp_getset */
13981 &PyBaseObject_Type, /* tp_base */
13982 0, /* tp_dict */
13983 0, /* tp_descr_get */
13984 0, /* tp_descr_set */
13985 0, /* tp_dictoffset */
13986 0, /* tp_init */
13987 0, /* tp_alloc */
13988 unicode_new, /* tp_new */
13989 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990};
13991
13992/* Initialize the Unicode implementation */
13993
Victor Stinner3a50e702011-10-18 21:21:00 +020013994int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013996 int i;
13997
Thomas Wouters477c8d52006-05-27 19:21:47 +000013998 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013999 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014000 0x000A, /* LINE FEED */
14001 0x000D, /* CARRIAGE RETURN */
14002 0x001C, /* FILE SEPARATOR */
14003 0x001D, /* GROUP SEPARATOR */
14004 0x001E, /* RECORD SEPARATOR */
14005 0x0085, /* NEXT LINE */
14006 0x2028, /* LINE SEPARATOR */
14007 0x2029, /* PARAGRAPH SEPARATOR */
14008 };
14009
Fred Drakee4315f52000-05-09 19:53:39 +000014010 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014011 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014012 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014013 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014014 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014015
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014016 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014018 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014019 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014020
14021 /* initialize the linebreak bloom filter */
14022 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014023 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014024 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014025
14026 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014027
14028#ifdef HAVE_MBCS
14029 winver.dwOSVersionInfoSize = sizeof(winver);
14030 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14031 PyErr_SetFromWindowsErr(0);
14032 return -1;
14033 }
14034#endif
14035 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036}
14037
14038/* Finalize the Unicode implementation */
14039
/* Nothing is released by this function; it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14045
Guido van Rossumd57fd912000-03-10 22:53:23 +000014046void
Thomas Wouters78890102000-07-22 19:25:51 +000014047_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014048{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014049 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014050
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014051 Py_XDECREF(unicode_empty);
14052 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014053
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014054 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014055 if (unicode_latin1[i]) {
14056 Py_DECREF(unicode_latin1[i]);
14057 unicode_latin1[i] = NULL;
14058 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014059 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014060 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014061 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014063
Walter Dörwald16807132007-05-25 13:52:07 +000014064void
14065PyUnicode_InternInPlace(PyObject **p)
14066{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014067 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014068 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014069#ifdef Py_DEBUG
14070 assert(s != NULL);
14071 assert(_PyUnicode_CHECK(s));
14072#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014074 return;
14075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014076 /* If it's a subclass, we don't really know what putting
14077 it in the interned dict might do. */
14078 if (!PyUnicode_CheckExact(s))
14079 return;
14080 if (PyUnicode_CHECK_INTERNED(s))
14081 return;
14082 if (interned == NULL) {
14083 interned = PyDict_New();
14084 if (interned == NULL) {
14085 PyErr_Clear(); /* Don't leave an exception */
14086 return;
14087 }
14088 }
14089 /* It might be that the GetItem call fails even
14090 though the key is present in the dictionary,
14091 namely when this happens during a stack overflow. */
14092 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014093 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014095
Benjamin Peterson29060642009-01-31 22:14:21 +000014096 if (t) {
14097 Py_INCREF(t);
14098 Py_DECREF(*p);
14099 *p = t;
14100 return;
14101 }
Walter Dörwald16807132007-05-25 13:52:07 +000014102
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014104 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014105 PyErr_Clear();
14106 PyThreadState_GET()->recursion_critical = 0;
14107 return;
14108 }
14109 PyThreadState_GET()->recursion_critical = 0;
14110 /* The two references in interned are not counted by refcnt.
14111 The deallocator will take care of this */
14112 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014113 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014114}
14115
14116void
14117PyUnicode_InternImmortal(PyObject **p)
14118{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014119 PyUnicode_InternInPlace(p);
14120 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014121 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 Py_INCREF(*p);
14123 }
Walter Dörwald16807132007-05-25 13:52:07 +000014124}
14125
14126PyObject *
14127PyUnicode_InternFromString(const char *cp)
14128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014129 PyObject *s = PyUnicode_FromString(cp);
14130 if (s == NULL)
14131 return NULL;
14132 PyUnicode_InternInPlace(&s);
14133 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014134}
14135
Alexander Belopolsky40018472011-02-26 01:02:56 +000014136void
14137_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014139 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014140 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 Py_ssize_t i, n;
14142 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014143
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 if (interned == NULL || !PyDict_Check(interned))
14145 return;
14146 keys = PyDict_Keys(interned);
14147 if (keys == NULL || !PyList_Check(keys)) {
14148 PyErr_Clear();
14149 return;
14150 }
Walter Dörwald16807132007-05-25 13:52:07 +000014151
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14153 detector, interned unicode strings are not forcibly deallocated;
14154 rather, we give them their stolen references back, and then clear
14155 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014156
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 n = PyList_GET_SIZE(keys);
14158 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014159 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014161 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014162 if (PyUnicode_READY(s) == -1) {
14163 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014164 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014166 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 case SSTATE_NOT_INTERNED:
14168 /* XXX Shouldn't happen */
14169 break;
14170 case SSTATE_INTERNED_IMMORTAL:
14171 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014172 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 break;
14174 case SSTATE_INTERNED_MORTAL:
14175 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014176 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 break;
14178 default:
14179 Py_FatalError("Inconsistent interned string state.");
14180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014181 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014182 }
14183 fprintf(stderr, "total size of all interned strings: "
14184 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14185 "mortal/immortal\n", mortal_size, immortal_size);
14186 Py_DECREF(keys);
14187 PyDict_Clear(interned);
14188 Py_DECREF(interned);
14189 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014190}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014191
14192
14193/********************* Unicode Iterator **************************/
14194
14195typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 PyObject_HEAD
14197 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014198 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014199} unicodeiterobject;
14200
14201static void
14202unicodeiter_dealloc(unicodeiterobject *it)
14203{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014204 _PyObject_GC_UNTRACK(it);
14205 Py_XDECREF(it->it_seq);
14206 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014207}
14208
14209static int
14210unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14211{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014212 Py_VISIT(it->it_seq);
14213 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014214}
14215
14216static PyObject *
14217unicodeiter_next(unicodeiterobject *it)
14218{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014219 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014220
Benjamin Peterson14339b62009-01-31 16:36:08 +000014221 assert(it != NULL);
14222 seq = it->it_seq;
14223 if (seq == NULL)
14224 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014225 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014227 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14228 int kind = PyUnicode_KIND(seq);
14229 void *data = PyUnicode_DATA(seq);
14230 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14231 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014232 if (item != NULL)
14233 ++it->it_index;
14234 return item;
14235 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014236
Benjamin Peterson14339b62009-01-31 16:36:08 +000014237 Py_DECREF(seq);
14238 it->it_seq = NULL;
14239 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014240}
14241
14242static PyObject *
14243unicodeiter_len(unicodeiterobject *it)
14244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 Py_ssize_t len = 0;
14246 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014247 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014249}
14250
14251PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14252
14253static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014254 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014255 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014256 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014257};
14258
14259PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14261 "str_iterator", /* tp_name */
14262 sizeof(unicodeiterobject), /* tp_basicsize */
14263 0, /* tp_itemsize */
14264 /* methods */
14265 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14266 0, /* tp_print */
14267 0, /* tp_getattr */
14268 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014269 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014270 0, /* tp_repr */
14271 0, /* tp_as_number */
14272 0, /* tp_as_sequence */
14273 0, /* tp_as_mapping */
14274 0, /* tp_hash */
14275 0, /* tp_call */
14276 0, /* tp_str */
14277 PyObject_GenericGetAttr, /* tp_getattro */
14278 0, /* tp_setattro */
14279 0, /* tp_as_buffer */
14280 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14281 0, /* tp_doc */
14282 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14283 0, /* tp_clear */
14284 0, /* tp_richcompare */
14285 0, /* tp_weaklistoffset */
14286 PyObject_SelfIter, /* tp_iter */
14287 (iternextfunc)unicodeiter_next, /* tp_iternext */
14288 unicodeiter_methods, /* tp_methods */
14289 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014290};
14291
14292static PyObject *
14293unicode_iter(PyObject *seq)
14294{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014295 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014296
Benjamin Peterson14339b62009-01-31 16:36:08 +000014297 if (!PyUnicode_Check(seq)) {
14298 PyErr_BadInternalCall();
14299 return NULL;
14300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014301 if (PyUnicode_READY(seq) == -1)
14302 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14304 if (it == NULL)
14305 return NULL;
14306 it->it_index = 0;
14307 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014308 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 _PyObject_GC_TRACK(it);
14310 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014311}
14312
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014313
/* Return the number of elements in the null-terminated Py_UNICODE
   string `u`, not counting the terminator.

   The length is derived from a pointer difference and returned as
   size_t, so it is never accumulated in an int that could overflow on
   very long buffers. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end != 0)
        end++;
    return (size_t)(end - u);
}
14322
/* Copy the null-terminated string `s2`, terminator included, into `s1`
   and return `s1`.  No bounds checking is performed. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    size_t i = 0;

    for (;;) {
        s1[i] = s2[i];
        if (s1[i] == 0)
            break;
        i++;
    }
    return s1;
}
14330
/* Copy at most `n` elements of the null-terminated string `s2` into
   `s1` and return `s1`.

   Copying stops right after the terminator has been copied.  If `s2`
   has `n` or more characters, exactly `n` elements are written and the
   result is NOT null-terminated.  Unlike C's strncpy(), the remainder
   of `s1` is not padded with null characters.  Nothing is written when
   `n` is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    /* The bound is checked before every store, so the destination never
       receives more than n elements. */
    for (i = 0; i < n; i++) {
        s1[i] = s2[i];
        if (s2[i] == 0)
            break;
    }
    return s1;
}
14340
/* Append the null-terminated string `s2` to the end of `s1` and return
   `s1`.  No bounds checking is performed. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the terminator of s1. */
    Py_UNICODE_strcpy(s1 + Py_UNICODE_strlen(s1), s2);
    return s1;
}
14349
/* Compare two null-terminated Py_UNICODE strings element by element.
   Returns -1, 0 or 1 when `s1` sorts before, equal to or after `s2`;
   a string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 != 0 && *s1 == *s2; s1++, s2++)
        ;

    if (*s1 == *s2)
        return 0;       /* both strings ended together */
    if (*s2 == 0)
        return 1;       /* s2 is a proper prefix of s1 */
    if (*s1 == 0)
        return -1;      /* s1 is a proper prefix of s2 */
    return (*s1 < *s2) ? -1 : +1;
}
14363
/* Compare at most `n` elements of two null-terminated Py_UNICODE
   strings.  Returns -1 or 1 at the first differing element, and 0 if
   the strings are equal up to `n` elements or up to their terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended here */
    }
    return 0;
}
14380
/* Return a pointer to the first element equal to `c` in the
   null-terminated string `s`, or NULL if there is none.  The terminator
   itself is never matched. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14390
/* Return a pointer to the last element equal to `c` in the
   null-terminated string `s`, or NULL if there is none.  The terminator
   itself is never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor;

    /* Walk backwards from the terminator towards the start. */
    for (cursor = s + Py_UNICODE_strlen(s); cursor != s; cursor--) {
        if (cursor[-1] == c)
            return (Py_UNICODE*)(cursor - 1);
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014403
Victor Stinner71133ff2010-09-01 23:43:53 +000014404Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014405PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014406{
Victor Stinner577db2c2011-10-11 22:12:48 +020014407 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014408 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014410 if (!PyUnicode_Check(unicode)) {
14411 PyErr_BadArgument();
14412 return NULL;
14413 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014414 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014415 if (u == NULL)
14416 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014417 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014418 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014419 PyErr_NoMemory();
14420 return NULL;
14421 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014422 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014423 size *= sizeof(Py_UNICODE);
14424 copy = PyMem_Malloc(size);
14425 if (copy == NULL) {
14426 PyErr_NoMemory();
14427 return NULL;
14428 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014429 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014430 return copy;
14431}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014432
Georg Brandl66c221e2010-10-14 07:04:07 +000014433/* A _string module, to export formatter_parser and formatter_field_name_split
14434 to the string.Formatter class implemented in Python. */
14435
14436static PyMethodDef _string_methods[] = {
14437 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14438 METH_O, PyDoc_STR("split the argument as a field name")},
14439 {"formatter_parser", (PyCFunction) formatter_parser,
14440 METH_O, PyDoc_STR("parse the argument as a format string")},
14441 {NULL, NULL}
14442};
14443
14444static struct PyModuleDef _string_module = {
14445 PyModuleDef_HEAD_INIT,
14446 "_string",
14447 PyDoc_STR("string helper module"),
14448 0,
14449 _string_methods,
14450 NULL,
14451 NULL,
14452 NULL,
14453 NULL
14454};
14455
14456PyMODINIT_FUNC
14457PyInit__string(void)
14458{
14459 return PyModule_Create(&_string_module);
14460}
14461
14462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014463#ifdef __cplusplus
14464}
14465#endif