blob: 7efa939f0f0a00530ad4357a85f85fbacffa2e64 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
/* Object check used by the accessor macros below: debug builds run the
   layout validator (without the content scan), other builds only test the
   type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Accessors with a leading underscore expand to the raw struct field (no
   assertions) and can be used as assignment targets.  The variants without
   the underscore assert that the object is valid and ready first. */

/* UTF-8 buffer pointer stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory that starts right after the PyASCIIObject header, otherwise the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Stored length of the UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: compact ASCII objects use the `length`
   field, all others the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wchar_t buffer pointer (PyASCIIObject part) and its length
   (PyCompactUnicodeObject part). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length, state and hash fields of the PyASCIIObject part. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked readers for the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Data pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY (any earlier definition is
   dropped first): evaluates to 0 when the object is already ready,
   otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the UTF-8 pointer of `op` is the same pointer as its character
   data, i.e. no separate UTF-8 buffer exists.  Must not be used on compact
   ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data.  Only the macro parameter is referenced, so the macro works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* True if the Unicode object owns a UTF-8 buffer of its own, i.e. the utf8
   pointer is set and is not the character data itself.  The compact-ASCII
   test comes first so that the utf8 field is never read on such objects. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr buffer of its own: the wstr
   pointer is set and either the object is not ready yet, or wstr is not
   the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL &&                            \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer to the buffer where the result
   characters are written; it is converted to "to_type *" as a whole, so
   an expression such as "buf + offset" is safe to pass.

   The copy is done four elements at a time, followed by a tail loop for
   the remaining 0..3 elements.  Each argument is evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of ASCII line breaks: a 128-entry, read-only table indexed
   by ASCII code, holding 1 for line-break characters and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a Unicode object.

   op            -- object to inspect; must pass PyUnicode_Check().
   check_content -- when non-zero (and the object is not in the wchar_t
                    kind), also scan the characters and verify that the
                    narrowest kind able to hold them is the one in use.

   Every violated invariant trips an assert().  The function itself always
   returns 1, which lets callers wrap the call in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;
    /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.compact == 1 && base->state.ascii == 1) {
        /* compact ASCII: always one byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *ext = (PyCompactUnicodeObject *)op;
        void *chars;

        if (base->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            chars = ext + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert (ext->utf8 != chars);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr buffer may be present */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(base->length == 0);
            assert(base->hash == -1);
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
            assert(base->wstr != NULL);
            assert(chars == NULL);
            assert(ext->utf8 == NULL);
        }
        else {
            /* non-compact and ready: characters live in data.any */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(chars != NULL);
            if (base->state.ascii) {
                /* ASCII data doubles as the UTF-8 representation */
                assert (ext->utf8 == chars);
                assert (ext->utf8_length == base->length);
            }
            else {
                assert (ext->utf8 != chars);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                /* characters are wchar_t sized: wstr must alias the data */
                assert(base->wstr == chars);
                assert(ext->wstr_length == base->length);
            }
            else {
                assert(base->wstr != chars);
            }
        }

        /* an absent buffer must have a zero length recorded */
        assert(ext->utf8 != NULL || ext->utf8_length == 0);
        assert(base->wstr != NULL || ext->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(base);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos = 0;

        while (pos < base->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
Victor Stinner3a50e702011-10-18 21:21:00 +0200504#ifdef HAVE_MBCS
505static OSVERSIONINFOEX winver;
506#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526#define BLOOM_MASK unsigned long
527
528static BLOOM_MASK bloom_linebreak;
529
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch modulo BLOOM_WIDTH).  `mask` is parenthesized so that
   compound expressions can be passed; for BLOOM_ADD it must be a
   modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: characters below 128 are looked up directly in the
   ascii_linebreak table; for the others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only evaluated when the mask bit is
   set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552#define BLOOM_MEMBER(mask, chr, str) \
553 (BLOOM(mask, chr) \
554 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000793 Ux0000 terminated; some code (e.g. new_identifier)
794 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000795
796 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000797 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798
799*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
#ifdef Py_DEBUG
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return the UTF-8 pointer of the string, as given by PyUnicode_UTF8(). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the data pointer of a compact string, as given by
   _PyUnicode_COMPACT_DATA(). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of the
   string to stdout, then return PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the internal layout of the string `op`
   to stdout: kind name, length, the wstr and utf8 pointers (flagged as
   "shared" when they alias the data buffer) and the data pointer.

   Pointer arguments are cast to void* because %p requires it, and the
   Py_ssize_t lengths are printed with the signed %zd conversion. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data by hand instead of using the accessor
       macros. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if ((void *)ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* compact ASCII strings have no wstr_length/utf8/utf8_length fields */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact
            && (void *)compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001001 if (maxchar > MAX_UNICODE) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "invalid maximum character passed to PyUnicode_New");
1004 return NULL;
1005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 kind_state = PyUnicode_4BYTE_KIND;
1007 char_size = 4;
1008 if (sizeof(wchar_t) == 4)
1009 is_sharing = 1;
1010 }
1011
1012 /* Ensure we won't overflow the size. */
1013 if (size < 0) {
1014 PyErr_SetString(PyExc_SystemError,
1015 "Negative size passed to PyUnicode_New");
1016 return NULL;
1017 }
1018 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1019 return PyErr_NoMemory();
1020
1021 /* Duplicated allocation code from _PyObject_New() instead of a call to
1022 * PyObject_New() so we are able to allocate space for the object and
1023 * it's data buffer.
1024 */
1025 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1026 if (obj == NULL)
1027 return PyErr_NoMemory();
1028 obj = PyObject_INIT(obj, &PyUnicode_Type);
1029 if (obj == NULL)
1030 return NULL;
1031
1032 unicode = (PyCompactUnicodeObject *)obj;
1033 if (is_ascii)
1034 data = ((PyASCIIObject*)obj) + 1;
1035 else
1036 data = unicode + 1;
1037 _PyUnicode_LENGTH(unicode) = size;
1038 _PyUnicode_HASH(unicode) = -1;
1039 _PyUnicode_STATE(unicode).interned = 0;
1040 _PyUnicode_STATE(unicode).kind = kind_state;
1041 _PyUnicode_STATE(unicode).compact = 1;
1042 _PyUnicode_STATE(unicode).ready = 1;
1043 _PyUnicode_STATE(unicode).ascii = is_ascii;
1044 if (is_ascii) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 }
1048 else if (kind_state == PyUnicode_1BYTE_KIND) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 }
1055 else {
1056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 if (kind_state == PyUnicode_2BYTE_KIND)
1059 ((Py_UCS2*)data)[size] = 0;
1060 else /* kind_state == PyUnicode_4BYTE_KIND */
1061 ((Py_UCS4*)data)[size] = 0;
1062 if (is_sharing) {
1063 _PyUnicode_WSTR_LENGTH(unicode) = size;
1064 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1065 }
1066 else {
1067 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1068 _PyUnicode_WSTR(unicode) = NULL;
1069 }
1070 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001071 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 return obj;
1073}
1074
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the code
   points into the 4-byte data buffer of `unicode`.  A high surrogate
   directly followed by a low surrogate is joined into one code point; any
   other unit, lone surrogates included, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units, one output code point */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1114
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115static int
Victor Stinner488fa492011-12-12 00:01:39 +01001116unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117{
Victor Stinner488fa492011-12-12 00:01:39 +01001118 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001119 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001120 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return -1;
1122 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001123 return 0;
1124}
1125
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001126static int
1127_copy_characters(PyObject *to, Py_ssize_t to_start,
1128 PyObject *from, Py_ssize_t from_start,
1129 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 unsigned int from_kind, to_kind;
1132 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001133 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
1136 assert(PyUnicode_Check(to));
1137 assert(PyUnicode_IS_READY(from));
1138 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001144 if (how_many == 0)
1145 return 0;
1146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001150 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152#ifdef Py_DEBUG
1153 if (!check_maxchar
1154 && (from_kind > to_kind
1155 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001157 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1158 Py_UCS4 ch;
1159 Py_ssize_t i;
1160 for (i=0; i < how_many; i++) {
1161 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1162 assert(ch <= to_maxchar);
1163 }
1164 }
1165#endif
1166 fast = (from_kind == to_kind);
1167 if (check_maxchar
1168 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1169 {
1170 /* deny latin1 => ascii */
1171 fast = 0;
1172 }
1173
1174 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001175 Py_MEMCPY((char*)to_data + to_kind * to_start,
1176 (char*)from_data + from_kind * from_start,
1177 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
1180 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS2,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_2BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001189 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS1, Py_UCS4,
1194 PyUnicode_1BYTE_DATA(from) + from_start,
1195 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
1199 else if (from_kind == PyUnicode_2BYTE_KIND
1200 && to_kind == PyUnicode_4BYTE_KIND)
1201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS2, Py_UCS4,
1204 PyUnicode_2BYTE_DATA(from) + from_start,
1205 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_4BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 /* check if max_char(from substring) <= max_char(to) */
1211 if (from_kind > to_kind
1212 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001213 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 /* slow path to check for character overflow */
1216 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001217 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 Py_ssize_t i;
1219
Victor Stinner56c161a2011-10-06 02:47:11 +02001220#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001221 for (i=0; i < how_many; i++) {
1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001223 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001224 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1225 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001226#else
1227 if (!check_maxchar) {
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1231 }
1232 }
1233 else {
1234 for (i=0; i < how_many; i++) {
1235 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1236 if (ch > to_maxchar)
1237 return 1;
1238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239 }
1240 }
1241#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001244 assert(0 && "inconsistent state");
1245 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001246 }
1247 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001248 return 0;
1249}
1250
1251static void
1252copy_characters(PyObject *to, Py_ssize_t to_start,
1253 PyObject *from, Py_ssize_t from_start,
1254 Py_ssize_t how_many)
1255{
1256 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1257}
1258
1259Py_ssize_t
1260PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1261 PyObject *from, Py_ssize_t from_start,
1262 Py_ssize_t how_many)
1263{
1264 int err;
1265
1266 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1267 PyErr_BadInternalCall();
1268 return -1;
1269 }
1270
Benjamin Petersonbac79492012-01-14 13:34:47 -05001271 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001273 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001274 return -1;
1275
1276 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1277 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1278 PyErr_Format(PyExc_SystemError,
1279 "Cannot write %zi characters at %zi "
1280 "in a string of %zi characters",
1281 how_many, to_start, PyUnicode_GET_LENGTH(to));
1282 return -1;
1283 }
1284
1285 if (how_many == 0)
1286 return 0;
1287
Victor Stinner488fa492011-12-12 00:01:39 +01001288 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289 return -1;
1290
1291 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1292 if (err) {
1293 PyErr_Format(PyExc_SystemError,
1294 "Cannot copy %s characters "
1295 "into a string of %s characters",
1296 unicode_kind_name(from),
1297 unicode_kind_name(to));
1298 return -1;
1299 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001300 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinner17222162011-09-28 22:15:37 +02001303/* Find the maximum code point and count the number of surrogate pairs so a
1304 correct string length can be computed before converting a string to UCS4.
1305 This function counts single surrogates as a character and not as a pair.
1306
1307 Return 0 on success, or -1 on error. */
1308static int
1309find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1310 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311{
1312 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001313 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314
Victor Stinnerc53be962011-10-02 21:33:54 +02001315 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 *num_surrogates = 0;
1317 *maxchar = 0;
1318
1319 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001321 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1322 && (iter+1) < end
1323 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001325 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 iter += 2;
1328 }
1329 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001331 {
1332 ch = *iter;
1333 iter++;
1334 }
1335 if (ch > *maxchar) {
1336 *maxchar = ch;
1337 if (*maxchar > MAX_UNICODE) {
1338 PyErr_Format(PyExc_ValueError,
1339 "character U+%x is not in range [U+0000; U+10ffff]",
1340 ch);
1341 return -1;
1342 }
1343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 }
1345 return 0;
1346}
1347
1348#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001349static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350#endif
1351
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001352int
1353_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 wchar_t *end;
1356 Py_UCS4 maxchar = 0;
1357 Py_ssize_t num_surrogates;
1358#if SIZEOF_WCHAR_T == 2
1359 Py_ssize_t length_wo_surrogates;
1360#endif
1361
Georg Brandl7597add2011-10-05 16:36:47 +02001362 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001363 strings were created using _PyObject_New() and where no canonical
1364 representation (the str field) has been set yet aka strings
1365 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001366 assert(_PyUnicode_CHECK(unicode));
1367 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001370 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001371 /* Actually, it should neither be interned nor be anything else: */
1372 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
1374#ifdef Py_DEBUG
1375 ++unicode_ready_calls;
1376#endif
1377
1378 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001379 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001380 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382
1383 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001384 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1385 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 PyErr_NoMemory();
1387 return -1;
1388 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001389 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 _PyUnicode_WSTR(unicode), end,
1391 PyUnicode_1BYTE_DATA(unicode));
1392 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1393 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1394 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1395 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001397 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001401 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8(unicode) = NULL;
1403 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 }
1405 PyObject_FREE(_PyUnicode_WSTR(unicode));
1406 _PyUnicode_WSTR(unicode) = NULL;
1407 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408 }
1409 /* In this case we might have to convert down from 4-byte native
1410 wchar_t to 2-byte unicode. */
1411 else if (maxchar < 65536) {
1412 assert(num_surrogates == 0 &&
1413 "FindMaxCharAndNumSurrogatePairs() messed up");
1414
Victor Stinner506f5922011-09-28 22:34:18 +02001415#if SIZEOF_WCHAR_T == 2
1416 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1419 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1420 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001423#else
1424 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001425 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001426 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001428 PyErr_NoMemory();
1429 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
Victor Stinner506f5922011-09-28 22:34:18 +02001431 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1432 _PyUnicode_WSTR(unicode), end,
1433 PyUnicode_2BYTE_DATA(unicode));
1434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyObject_FREE(_PyUnicode_WSTR(unicode));
1440 _PyUnicode_WSTR(unicode) = NULL;
1441 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1442#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 }
1444 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1445 else {
1446#if SIZEOF_WCHAR_T == 2
1447 /* in case the native representation is 2-bytes, we need to allocate a
1448 new normalized 4-byte version. */
1449 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1451 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyErr_NoMemory();
1453 return -1;
1454 }
1455 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1456 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001457 _PyUnicode_UTF8(unicode) = NULL;
1458 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001459 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1460 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001461 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 PyObject_FREE(_PyUnicode_WSTR(unicode));
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1465#else
1466 assert(num_surrogates == 0);
1467
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1473#endif
1474 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1475 }
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001477 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return 0;
1479}
1480
Alexander Belopolsky40018472011-02-26 01:02:56 +00001481static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001482unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483{
Walter Dörwald16807132007-05-25 13:52:07 +00001484 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 case SSTATE_NOT_INTERNED:
1486 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001487
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 case SSTATE_INTERNED_MORTAL:
1489 /* revive dead object temporarily for DelItem */
1490 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001491 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 Py_FatalError(
1493 "deletion of interned string failed");
1494 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001495
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_INTERNED_IMMORTAL:
1497 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 default:
1500 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001501 }
1502
Victor Stinner03490912011-10-03 23:45:12 +02001503 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001505 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001506 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001507 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1508 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001509
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001510 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511}
1512
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared objects kept by
   this module (the empty string, or the entry of the unicode_latin1 table
   matching its single character), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only one-character strings that are not in the wchar_t-only state
       can be entries of the latin1 table. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1529
Alexander Belopolsky40018472011-02-26 01:02:56 +00001530static int
Victor Stinner488fa492011-12-12 00:01:39 +01001531unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001532{
Victor Stinner488fa492011-12-12 00:01:39 +01001533 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 if (Py_REFCNT(unicode) != 1)
1535 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001536 if (_PyUnicode_HASH(unicode) != -1)
1537 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001538 if (PyUnicode_CHECK_INTERNED(unicode))
1539 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001540 if (!PyUnicode_CheckExact(unicode))
1541 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001542#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001543 /* singleton refcount is greater than 1 */
1544 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001545#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001546 return 1;
1547}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549static int
1550unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1551{
1552 PyObject *unicode;
1553 Py_ssize_t old_length;
1554
1555 assert(p_unicode != NULL);
1556 unicode = *p_unicode;
1557
1558 assert(unicode != NULL);
1559 assert(PyUnicode_Check(unicode));
1560 assert(0 <= length);
1561
Victor Stinner910337b2011-10-03 03:20:16 +02001562 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001563 old_length = PyUnicode_WSTR_LENGTH(unicode);
1564 else
1565 old_length = PyUnicode_GET_LENGTH(unicode);
1566 if (old_length == length)
1567 return 0;
1568
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001569 if (length == 0) {
1570 Py_DECREF(*p_unicode);
1571 *p_unicode = unicode_empty;
1572 Py_INCREF(*p_unicode);
1573 return 0;
1574 }
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 PyObject *copy = resize_copy(unicode, length);
1578 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 Py_DECREF(*p_unicode);
1581 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583 }
1584
Victor Stinnerfe226c02011-10-03 03:52:20 +02001585 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001586 PyObject *new_unicode = resize_compact(unicode, length);
1587 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001589 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001590 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001592 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001593 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594}
1595
Alexander Belopolsky40018472011-02-26 01:02:56 +00001596int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001597PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001598{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 PyObject *unicode;
1600 if (p_unicode == NULL) {
1601 PyErr_BadInternalCall();
1602 return -1;
1603 }
1604 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001605 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 {
1607 PyErr_BadInternalCall();
1608 return -1;
1609 }
1610 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001611}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001612
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001613static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001614unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001615{
1616 PyObject *result;
1617 assert(PyUnicode_IS_READY(*p_unicode));
1618 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1619 return 0;
1620 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1621 maxchar);
1622 if (result == NULL)
1623 return -1;
1624 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1625 PyUnicode_GET_LENGTH(*p_unicode));
1626 Py_DECREF(*p_unicode);
1627 *p_unicode = result;
1628 return 0;
1629}
1630
1631static int
1632unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1633 Py_UCS4 ch)
1634{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001635 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001636 if (unicode_widen(p_unicode, ch) < 0)
1637 return -1;
1638 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1639 PyUnicode_DATA(*p_unicode),
1640 (*pos)++, ch);
1641 return 0;
1642}
1643
Victor Stinnerc5166102012-02-22 13:55:02 +01001644/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1645 Return the length of the input string.
1646
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001647 WARNING: The function doesn't copy the terminating null character and
1648 doesn't check the maximum character (may write a latin1 character in an
1649 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001650static Py_ssize_t
1651unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1652{
1653 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1654 void *data = PyUnicode_DATA(unicode);
1655
1656 switch (kind) {
1657 case PyUnicode_1BYTE_KIND: {
1658 Py_ssize_t len = strlen(str);
1659 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001660 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001661 return len;
1662 }
1663 case PyUnicode_2BYTE_KIND: {
1664 Py_UCS2 *start = (Py_UCS2 *)data + index;
1665 Py_UCS2 *ucs2 = start;
1666 assert(index <= PyUnicode_GET_LENGTH(unicode));
1667
1668 for (; *str; ++ucs2, ++str)
1669 *ucs2 = (Py_UCS2)*str;
1670
1671 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1672 return ucs2 - start;
1673 }
1674 default: {
1675 Py_UCS4 *start = (Py_UCS4 *)data + index;
1676 Py_UCS4 *ucs4 = start;
1677 assert(kind == PyUnicode_4BYTE_KIND);
1678 assert(index <= PyUnicode_GET_LENGTH(unicode));
1679
1680 for (; *str; ++ucs4, ++str)
1681 *ucs4 = (Py_UCS4)*str;
1682
1683 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1684 return ucs4 - start;
1685 }
1686 }
1687}
1688
1689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690static PyObject*
1691get_latin1_char(unsigned char ch)
1692{
Victor Stinnera464fc12011-10-02 20:39:30 +02001693 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001695 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 if (!unicode)
1697 return NULL;
1698 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001699 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 unicode_latin1[ch] = unicode;
1701 }
1702 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704}
1705
Alexander Belopolsky40018472011-02-26 01:02:56 +00001706PyObject *
1707PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001709 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 Py_UCS4 maxchar = 0;
1711 Py_ssize_t num_surrogates;
1712
1713 if (u == NULL)
1714 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001716 /* If the Unicode data is known at construction time, we can apply
1717 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 /* Optimization for empty strings */
1720 if (size == 0 && unicode_empty != NULL) {
1721 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001722 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001723 }
Tim Petersced69f82003-09-16 20:30:58 +00001724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 /* Single character Unicode objects in the Latin-1 range are
1726 shared when using this constructor */
1727 if (size == 1 && *u < 256)
1728 return get_latin1_char((unsigned char)*u);
1729
1730 /* If not empty and not single character, copy the Unicode data
1731 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 if (find_maxchar_surrogates(u, u + size,
1733 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 return NULL;
1735
Victor Stinner8faf8212011-12-08 22:14:11 +01001736 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 if (!unicode)
1738 return NULL;
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 switch (PyUnicode_KIND(unicode)) {
1741 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001742 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1744 break;
1745 case PyUnicode_2BYTE_KIND:
1746#if Py_UNICODE_SIZE == 2
1747 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1748#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001749 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1751#endif
1752 break;
1753 case PyUnicode_4BYTE_KIND:
1754#if SIZEOF_WCHAR_T == 2
1755 /* This is the only case which has to process surrogates, thus
1756 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001757 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758#else
1759 assert(num_surrogates == 0);
1760 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1761#endif
1762 break;
1763 default:
1764 assert(0 && "Impossible state");
1765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001767 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768}
1769
Alexander Belopolsky40018472011-02-26 01:02:56 +00001770PyObject *
1771PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 if (size < 0) {
1774 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001776 return NULL;
1777 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001778 if (u != NULL)
1779 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1780 else
1781 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782}
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784PyObject *
1785PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001786{
1787 size_t size = strlen(u);
1788 if (size > PY_SSIZE_T_MAX) {
1789 PyErr_SetString(PyExc_OverflowError, "input too long");
1790 return NULL;
1791 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001792 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001793}
1794
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001795PyObject *
1796_PyUnicode_FromId(_Py_Identifier *id)
1797{
1798 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001799 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1800 strlen(id->string),
1801 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001802 if (!id->object)
1803 return NULL;
1804 PyUnicode_InternInPlace(&id->object);
1805 assert(!id->next);
1806 id->next = static_strings;
1807 static_strings = id;
1808 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001809 return id->object;
1810}
1811
1812void
1813_PyUnicode_ClearStaticStrings()
1814{
1815 _Py_Identifier *i;
1816 for (i = static_strings; i; i = i->next) {
1817 Py_DECREF(i->object);
1818 i->object = NULL;
1819 i->next = NULL;
1820 }
1821}
1822
Benjamin Peterson0df54292012-03-26 14:50:32 -04001823/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824
Victor Stinnere57b1c02011-09-28 22:20:48 +02001825static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001826unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001827{
Victor Stinner785938e2011-12-11 20:09:03 +01001828 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001829 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001830#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001831 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001832#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001833 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001834 }
Victor Stinner785938e2011-12-11 20:09:03 +01001835 unicode = PyUnicode_New(size, 127);
1836 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001837 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001838 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1839 assert(_PyUnicode_CheckConsistency(unicode, 1));
1840 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001841}
1842
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001843static Py_UCS4
1844kind_maxchar_limit(unsigned int kind)
1845{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001846 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001847 case PyUnicode_1BYTE_KIND:
1848 return 0x80;
1849 case PyUnicode_2BYTE_KIND:
1850 return 0x100;
1851 case PyUnicode_4BYTE_KIND:
1852 return 0x10000;
1853 default:
1854 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001855 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001856 }
1857}
1858
Victor Stinner702c7342011-10-05 13:50:52 +02001859static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001860_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001863 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001864
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001865 if (size == 0) {
1866 Py_INCREF(unicode_empty);
1867 return unicode_empty;
1868 }
1869 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001870 if (size == 1)
1871 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001873 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001874 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 if (!res)
1876 return NULL;
1877 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001878 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001880}
1881
Victor Stinnere57b1c02011-09-28 22:20:48 +02001882static PyObject*
1883_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884{
1885 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001886 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001887
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 if (size == 0) {
1889 Py_INCREF(unicode_empty);
1890 return unicode_empty;
1891 }
1892 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001894 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001895
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001896 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001897 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 if (!res)
1899 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001900 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001902 else {
1903 _PyUnicode_CONVERT_BYTES(
1904 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1905 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001906 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 return res;
1908}
1909
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910static PyObject*
1911_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912{
1913 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001915
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 if (size == 0) {
1917 Py_INCREF(unicode_empty);
1918 return unicode_empty;
1919 }
1920 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001921 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001922 return get_latin1_char((unsigned char)u[0]);
1923
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001924 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001925 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!res)
1927 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001928 if (max_char < 256)
1929 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1930 PyUnicode_1BYTE_DATA(res));
1931 else if (max_char < 0x10000)
1932 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1933 PyUnicode_2BYTE_DATA(res));
1934 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001936 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 return res;
1938}
1939
1940PyObject*
1941PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1942{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001943 if (size < 0) {
1944 PyErr_SetString(PyExc_ValueError, "size must be positive");
1945 return NULL;
1946 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001947 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001949 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001951 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001955 PyErr_SetString(PyExc_SystemError, "invalid kind");
1956 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958}
1959
Victor Stinnerece58de2012-04-23 23:36:38 +02001960Py_UCS4
1961_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1962{
1963 enum PyUnicode_Kind kind;
1964 void *startptr, *endptr;
1965
1966 assert(PyUnicode_IS_READY(unicode));
1967 assert(0 <= start);
1968 assert(end <= PyUnicode_GET_LENGTH(unicode));
1969 assert(start <= end);
1970
1971 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
1972 return PyUnicode_MAX_CHAR_VALUE(unicode);
1973
1974 if (start == end)
1975 return 127;
1976
1977 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04001978 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04001979 endptr = (char *)startptr + end * kind;
1980 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001981 switch(kind) {
1982 case PyUnicode_1BYTE_KIND:
1983 return ucs1lib_find_max_char(startptr, endptr);
1984 case PyUnicode_2BYTE_KIND:
1985 return ucs2lib_find_max_char(startptr, endptr);
1986 case PyUnicode_4BYTE_KIND:
1987 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02001988 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001989 assert(0);
1990 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02001991 }
1992}
1993
Victor Stinner25a4b292011-10-06 12:31:55 +02001994/* Ensure that a string uses the most efficient storage, if it is not the
1995 case: create a new string with of the right kind. Write NULL into *p_unicode
1996 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001997static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001998unicode_adjust_maxchar(PyObject **p_unicode)
1999{
2000 PyObject *unicode, *copy;
2001 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002002 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002003 unsigned int kind;
2004
2005 assert(p_unicode != NULL);
2006 unicode = *p_unicode;
2007 assert(PyUnicode_IS_READY(unicode));
2008 if (PyUnicode_IS_ASCII(unicode))
2009 return;
2010
2011 len = PyUnicode_GET_LENGTH(unicode);
2012 kind = PyUnicode_KIND(unicode);
2013 if (kind == PyUnicode_1BYTE_KIND) {
2014 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002015 max_char = ucs1lib_find_max_char(u, u + len);
2016 if (max_char >= 128)
2017 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002018 }
2019 else if (kind == PyUnicode_2BYTE_KIND) {
2020 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002021 max_char = ucs2lib_find_max_char(u, u + len);
2022 if (max_char >= 256)
2023 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002024 }
2025 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002026 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002027 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002028 max_char = ucs4lib_find_max_char(u, u + len);
2029 if (max_char >= 0x10000)
2030 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002031 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002032 copy = PyUnicode_New(len, max_char);
2033 copy_characters(copy, 0, unicode, 0, len);
2034 Py_DECREF(unicode);
2035 *p_unicode = copy;
2036}
2037
Victor Stinner034f6cf2011-09-30 02:26:44 +02002038PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002039_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002040{
Victor Stinner87af4f22011-11-21 23:03:47 +01002041 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002042 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002043
Victor Stinner034f6cf2011-09-30 02:26:44 +02002044 if (!PyUnicode_Check(unicode)) {
2045 PyErr_BadInternalCall();
2046 return NULL;
2047 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002048 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002049 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002050
Victor Stinner87af4f22011-11-21 23:03:47 +01002051 length = PyUnicode_GET_LENGTH(unicode);
2052 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002053 if (!copy)
2054 return NULL;
2055 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2056
Victor Stinner87af4f22011-11-21 23:03:47 +01002057 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2058 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002059 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002060 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002061}
2062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063
Victor Stinnerbc603d12011-10-02 01:00:40 +02002064/* Widen Unicode objects to larger buffers. Don't write terminating null
2065 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066
2067void*
2068_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2069{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002070 Py_ssize_t len;
2071 void *result;
2072 unsigned int skind;
2073
Benjamin Petersonbac79492012-01-14 13:34:47 -05002074 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002075 return NULL;
2076
2077 len = PyUnicode_GET_LENGTH(s);
2078 skind = PyUnicode_KIND(s);
2079 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002080 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 return NULL;
2082 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002083 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002084 case PyUnicode_2BYTE_KIND:
2085 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2086 if (!result)
2087 return PyErr_NoMemory();
2088 assert(skind == PyUnicode_1BYTE_KIND);
2089 _PyUnicode_CONVERT_BYTES(
2090 Py_UCS1, Py_UCS2,
2091 PyUnicode_1BYTE_DATA(s),
2092 PyUnicode_1BYTE_DATA(s) + len,
2093 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002095 case PyUnicode_4BYTE_KIND:
2096 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2097 if (!result)
2098 return PyErr_NoMemory();
2099 if (skind == PyUnicode_2BYTE_KIND) {
2100 _PyUnicode_CONVERT_BYTES(
2101 Py_UCS2, Py_UCS4,
2102 PyUnicode_2BYTE_DATA(s),
2103 PyUnicode_2BYTE_DATA(s) + len,
2104 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002105 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002106 else {
2107 assert(skind == PyUnicode_1BYTE_KIND);
2108 _PyUnicode_CONVERT_BYTES(
2109 Py_UCS1, Py_UCS4,
2110 PyUnicode_1BYTE_DATA(s),
2111 PyUnicode_1BYTE_DATA(s) + len,
2112 result);
2113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002115 default:
2116 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117 }
Victor Stinner01698042011-10-04 00:04:26 +02002118 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 return NULL;
2120}
2121
2122static Py_UCS4*
2123as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2124 int copy_null)
2125{
2126 int kind;
2127 void *data;
2128 Py_ssize_t len, targetlen;
2129 if (PyUnicode_READY(string) == -1)
2130 return NULL;
2131 kind = PyUnicode_KIND(string);
2132 data = PyUnicode_DATA(string);
2133 len = PyUnicode_GET_LENGTH(string);
2134 targetlen = len;
2135 if (copy_null)
2136 targetlen++;
2137 if (!target) {
2138 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2139 PyErr_NoMemory();
2140 return NULL;
2141 }
2142 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2143 if (!target) {
2144 PyErr_NoMemory();
2145 return NULL;
2146 }
2147 }
2148 else {
2149 if (targetsize < targetlen) {
2150 PyErr_Format(PyExc_SystemError,
2151 "string is longer than the buffer");
2152 if (copy_null && 0 < targetsize)
2153 target[0] = 0;
2154 return NULL;
2155 }
2156 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002157 if (kind == PyUnicode_1BYTE_KIND) {
2158 Py_UCS1 *start = (Py_UCS1 *) data;
2159 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002161 else if (kind == PyUnicode_2BYTE_KIND) {
2162 Py_UCS2 *start = (Py_UCS2 *) data;
2163 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2164 }
2165 else {
2166 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 if (copy_null)
2170 target[len] = 0;
2171 return target;
2172}
2173
2174Py_UCS4*
2175PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2176 int copy_null)
2177{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002178 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 PyErr_BadInternalCall();
2180 return NULL;
2181 }
2182 return as_ucs4(string, target, targetsize, copy_null);
2183}
2184
2185Py_UCS4*
2186PyUnicode_AsUCS4Copy(PyObject *string)
2187{
2188 return as_ucs4(string, NULL, 0, 1);
2189}
2190
2191#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002192
Alexander Belopolsky40018472011-02-26 01:02:56 +00002193PyObject *
2194PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002197 if (size == 0) {
2198 Py_INCREF(unicode_empty);
2199 return unicode_empty;
2200 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002201 PyErr_BadInternalCall();
2202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 }
2204
Martin v. Löwis790465f2008-04-05 20:41:37 +00002205 if (size == -1) {
2206 size = wcslen(w);
2207 }
2208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210}
2211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002213
Walter Dörwald346737f2007-05-31 10:44:43 +00002214static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002215makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2216 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002217{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 *fmt++ = '%';
2219 if (width) {
2220 if (zeropad)
2221 *fmt++ = '0';
2222 fmt += sprintf(fmt, "%d", width);
2223 }
2224 if (precision)
2225 fmt += sprintf(fmt, ".%d", precision);
2226 if (longflag)
2227 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002228 else if (longlongflag) {
2229 /* longlongflag should only ever be nonzero on machines with
2230 HAVE_LONG_LONG defined */
2231#ifdef HAVE_LONG_LONG
2232 char *f = PY_FORMAT_LONG_LONG;
2233 while (*f)
2234 *fmt++ = *f++;
2235#else
2236 /* we shouldn't ever get here */
2237 assert(0);
2238 *fmt++ = 'l';
2239#endif
2240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002241 else if (size_tflag) {
2242 char *f = PY_FORMAT_SIZE_T;
2243 while (*f)
2244 *fmt++ = *f++;
2245 }
2246 *fmt++ = c;
2247 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002248}
2249
Victor Stinner96865452011-03-01 23:44:09 +00002250/* helper for PyUnicode_FromFormatV() */
2251
2252static const char*
2253parse_format_flags(const char *f,
2254 int *p_width, int *p_precision,
2255 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2256{
2257 int width, precision, longflag, longlongflag, size_tflag;
2258
2259 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2260 f++;
2261 width = 0;
2262 while (Py_ISDIGIT((unsigned)*f))
2263 width = (width*10) + *f++ - '0';
2264 precision = 0;
2265 if (*f == '.') {
2266 f++;
2267 while (Py_ISDIGIT((unsigned)*f))
2268 precision = (precision*10) + *f++ - '0';
2269 if (*f == '%') {
2270 /* "%.3%s" => f points to "3" */
2271 f--;
2272 }
2273 }
2274 if (*f == '\0') {
2275 /* bogus format "%.1" => go backward, f points to "1" */
2276 f--;
2277 }
2278 if (p_width != NULL)
2279 *p_width = width;
2280 if (p_precision != NULL)
2281 *p_precision = precision;
2282
2283 /* Handle %ld, %lu, %lld and %llu. */
2284 longflag = 0;
2285 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002286 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002287
2288 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002289 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002290 longflag = 1;
2291 ++f;
2292 }
2293#ifdef HAVE_LONG_LONG
2294 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002295 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002296 longlongflag = 1;
2297 f += 2;
2298 }
2299#endif
2300 }
2301 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002302 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002303 size_tflag = 1;
2304 ++f;
2305 }
2306 if (p_longflag != NULL)
2307 *p_longflag = longflag;
2308 if (p_longlongflag != NULL)
2309 *p_longlongflag = longlongflag;
2310 if (p_size_tflag != NULL)
2311 *p_size_tflag = size_tflag;
2312 return f;
2313}
2314
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315/* maximum number of characters required for output of %ld. 21 characters
2316 allows for 64-bit integers (in decimal) and an optional sign. */
2317#define MAX_LONG_CHARS 21
2318/* maximum number of characters required for output of %lld.
2319 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2320 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2321#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2322
Walter Dörwaldd2034312007-05-18 16:29:38 +00002323PyObject *
2324PyUnicode_FromFormatV(const char *format, va_list vargs)
2325{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002326 va_list count;
2327 Py_ssize_t callcount = 0;
2328 PyObject **callresults = NULL;
2329 PyObject **callresult = NULL;
2330 Py_ssize_t n = 0;
2331 int width = 0;
2332 int precision = 0;
2333 int zeropad;
2334 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002335 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002337 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2339 Py_UCS4 argmaxchar;
2340 Py_ssize_t numbersize = 0;
2341 char *numberresults = NULL;
2342 char *numberresult = NULL;
2343 Py_ssize_t i;
2344 int kind;
2345 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002346
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002347 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002348 /* step 1: count the number of %S/%R/%A/%s format specifications
2349 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2350 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002352 * also estimate a upper bound for all the number formats in the string,
2353 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002355 for (f = format; *f; f++) {
2356 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002357 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002358 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2359 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2360 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2361 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002364#ifdef HAVE_LONG_LONG
2365 if (longlongflag) {
2366 if (width < MAX_LONG_LONG_CHARS)
2367 width = MAX_LONG_LONG_CHARS;
2368 }
2369 else
2370#endif
2371 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2372 including sign. Decimal takes the most space. This
2373 isn't enough for octal. If a width is specified we
2374 need more (which we allocate later). */
2375 if (width < MAX_LONG_CHARS)
2376 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377
2378 /* account for the size + '\0' to separate numbers
2379 inside of the numberresults buffer */
2380 numbersize += (width + 1);
2381 }
2382 }
2383 else if ((unsigned char)*f > 127) {
2384 PyErr_Format(PyExc_ValueError,
2385 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2386 "string, got a non-ASCII byte: 0x%02x",
2387 (unsigned char)*f);
2388 return NULL;
2389 }
2390 }
2391 /* step 2: allocate memory for the results of
2392 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2393 if (callcount) {
2394 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2395 if (!callresults) {
2396 PyErr_NoMemory();
2397 return NULL;
2398 }
2399 callresult = callresults;
2400 }
2401 /* step 2.5: allocate memory for the results of formating numbers */
2402 if (numbersize) {
2403 numberresults = PyObject_Malloc(numbersize);
2404 if (!numberresults) {
2405 PyErr_NoMemory();
2406 goto fail;
2407 }
2408 numberresult = numberresults;
2409 }
2410
2411 /* step 3: format numbers and figure out how large a buffer we need */
2412 for (f = format; *f; f++) {
2413 if (*f == '%') {
2414 const char* p;
2415 int longflag;
2416 int longlongflag;
2417 int size_tflag;
2418 int numprinted;
2419
2420 p = f;
2421 zeropad = (f[1] == '0');
2422 f = parse_format_flags(f, &width, &precision,
2423 &longflag, &longlongflag, &size_tflag);
2424 switch (*f) {
2425 case 'c':
2426 {
2427 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002428 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 n++;
2430 break;
2431 }
2432 case '%':
2433 n++;
2434 break;
2435 case 'i':
2436 case 'd':
2437 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2438 width, precision, *f);
2439 if (longflag)
2440 numprinted = sprintf(numberresult, fmt,
2441 va_arg(count, long));
2442#ifdef HAVE_LONG_LONG
2443 else if (longlongflag)
2444 numprinted = sprintf(numberresult, fmt,
2445 va_arg(count, PY_LONG_LONG));
2446#endif
2447 else if (size_tflag)
2448 numprinted = sprintf(numberresult, fmt,
2449 va_arg(count, Py_ssize_t));
2450 else
2451 numprinted = sprintf(numberresult, fmt,
2452 va_arg(count, int));
2453 n += numprinted;
2454 /* advance by +1 to skip over the '\0' */
2455 numberresult += (numprinted + 1);
2456 assert(*(numberresult - 1) == '\0');
2457 assert(*(numberresult - 2) != '\0');
2458 assert(numprinted >= 0);
2459 assert(numberresult <= numberresults + numbersize);
2460 break;
2461 case 'u':
2462 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2463 width, precision, 'u');
2464 if (longflag)
2465 numprinted = sprintf(numberresult, fmt,
2466 va_arg(count, unsigned long));
2467#ifdef HAVE_LONG_LONG
2468 else if (longlongflag)
2469 numprinted = sprintf(numberresult, fmt,
2470 va_arg(count, unsigned PY_LONG_LONG));
2471#endif
2472 else if (size_tflag)
2473 numprinted = sprintf(numberresult, fmt,
2474 va_arg(count, size_t));
2475 else
2476 numprinted = sprintf(numberresult, fmt,
2477 va_arg(count, unsigned int));
2478 n += numprinted;
2479 numberresult += (numprinted + 1);
2480 assert(*(numberresult - 1) == '\0');
2481 assert(*(numberresult - 2) != '\0');
2482 assert(numprinted >= 0);
2483 assert(numberresult <= numberresults + numbersize);
2484 break;
2485 case 'x':
2486 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2487 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2488 n += numprinted;
2489 numberresult += (numprinted + 1);
2490 assert(*(numberresult - 1) == '\0');
2491 assert(*(numberresult - 2) != '\0');
2492 assert(numprinted >= 0);
2493 assert(numberresult <= numberresults + numbersize);
2494 break;
2495 case 'p':
2496 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2497 /* %p is ill-defined: ensure leading 0x. */
2498 if (numberresult[1] == 'X')
2499 numberresult[1] = 'x';
2500 else if (numberresult[1] != 'x') {
2501 memmove(numberresult + 2, numberresult,
2502 strlen(numberresult) + 1);
2503 numberresult[0] = '0';
2504 numberresult[1] = 'x';
2505 numprinted += 2;
2506 }
2507 n += numprinted;
2508 numberresult += (numprinted + 1);
2509 assert(*(numberresult - 1) == '\0');
2510 assert(*(numberresult - 2) != '\0');
2511 assert(numprinted >= 0);
2512 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 break;
2514 case 's':
2515 {
2516 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002517 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002518 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002519 if (!str)
2520 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 /* since PyUnicode_DecodeUTF8 returns already flexible
2522 unicode objects, there is no need to call ready on them */
2523 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002524 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002526 /* Remember the str and switch to the next slot */
2527 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002528 break;
2529 }
2530 case 'U':
2531 {
2532 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002533 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 if (PyUnicode_READY(obj) == -1)
2535 goto fail;
2536 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002537 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
2540 }
2541 case 'V':
2542 {
2543 PyObject *obj = va_arg(count, PyObject *);
2544 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002545 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002547 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002548 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 if (PyUnicode_READY(obj) == -1)
2550 goto fail;
2551 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002552 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002554 *callresult++ = NULL;
2555 }
2556 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002557 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002558 if (!str_obj)
2559 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002560 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002561 Py_DECREF(str_obj);
2562 goto fail;
2563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002565 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002567 *callresult++ = str_obj;
2568 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 break;
2570 }
2571 case 'S':
2572 {
2573 PyObject *obj = va_arg(count, PyObject *);
2574 PyObject *str;
2575 assert(obj);
2576 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002577 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002578 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002579 if (PyUnicode_READY(str) == -1) {
2580 Py_DECREF(str);
2581 goto fail;
2582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002584 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 /* Remember the str and switch to the next slot */
2587 *callresult++ = str;
2588 break;
2589 }
2590 case 'R':
2591 {
2592 PyObject *obj = va_arg(count, PyObject *);
2593 PyObject *repr;
2594 assert(obj);
2595 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002596 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002598 if (PyUnicode_READY(repr) == -1) {
2599 Py_DECREF(repr);
2600 goto fail;
2601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002603 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 /* Remember the repr and switch to the next slot */
2606 *callresult++ = repr;
2607 break;
2608 }
2609 case 'A':
2610 {
2611 PyObject *obj = va_arg(count, PyObject *);
2612 PyObject *ascii;
2613 assert(obj);
2614 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002615 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002617 if (PyUnicode_READY(ascii) == -1) {
2618 Py_DECREF(ascii);
2619 goto fail;
2620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002622 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 /* Remember the repr and switch to the next slot */
2625 *callresult++ = ascii;
2626 break;
2627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 default:
2629 /* if we stumble upon an unknown
2630 formatting code, copy the rest of
2631 the format string to the output
2632 string. (we cannot just skip the
2633 code, since there's no way to know
2634 what's in the argument list) */
2635 n += strlen(p);
2636 goto expand;
2637 }
2638 } else
2639 n++;
2640 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002643 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 we don't have to resize the string.
2645 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002646 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 if (!string)
2648 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 kind = PyUnicode_KIND(string);
2650 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002656 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002657
2658 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2660 /* checking for == because the last argument could be a empty
2661 string, which causes i to point to end, the assert at the end of
2662 the loop */
2663 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002664
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 switch (*f) {
2666 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002667 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 const int ordinal = va_arg(vargs, int);
2669 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002671 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002677 {
2678 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 /* unused, since we already have the result */
2680 if (*f == 'p')
2681 (void) va_arg(vargs, void *);
2682 else
2683 (void) va_arg(vargs, int);
2684 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002685 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002687 i += written;
2688 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002689 assert(*numberresult == '\0');
2690 numberresult++;
2691 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002693 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 case 's':
2695 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002696 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002698 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002699 size = PyUnicode_GET_LENGTH(*callresult);
2700 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002701 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002703 /* We're done with the unicode()/repr() => forget it */
2704 Py_DECREF(*callresult);
2705 /* switch to next unicode()/repr() result */
2706 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 break;
2708 }
2709 case 'U':
2710 {
2711 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 Py_ssize_t size;
2713 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2714 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002715 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 break;
2718 }
2719 case 'V':
2720 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002723 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 size = PyUnicode_GET_LENGTH(obj);
2726 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002727 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 size = PyUnicode_GET_LENGTH(*callresult);
2731 assert(PyUnicode_KIND(*callresult) <=
2732 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002733 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002735 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002737 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002738 break;
2739 }
2740 case 'S':
2741 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002742 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002744 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 /* unused, since we already have the result */
2746 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002748 copy_characters(string, i, *callresult, 0, size);
2749 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002750 /* We're done with the unicode()/repr() => forget it */
2751 Py_DECREF(*callresult);
2752 /* switch to next unicode()/repr() result */
2753 ++callresult;
2754 break;
2755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002758 break;
2759 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002760 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 goto end;
2763 }
Victor Stinner1205f272010-09-11 00:54:47 +00002764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765 else {
2766 assert(i < PyUnicode_GET_LENGTH(string));
2767 PyUnicode_WRITE(kind, data, i++, *f);
2768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002771
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002773 if (callresults)
2774 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 if (numberresults)
2776 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002777 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002778 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002779 if (callresults) {
2780 PyObject **callresult2 = callresults;
2781 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002782 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 ++callresult2;
2784 }
2785 PyObject_Free(callresults);
2786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 if (numberresults)
2788 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002790}
2791
Walter Dörwaldd2034312007-05-18 16:29:38 +00002792PyObject *
2793PyUnicode_FromFormat(const char *format, ...)
2794{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002795 PyObject* ret;
2796 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002797
2798#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002800#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002801 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 ret = PyUnicode_FromFormatV(format, vargs);
2804 va_end(vargs);
2805 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002806}
2807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808#ifdef HAVE_WCHAR_H
2809
Victor Stinner5593d8a2010-10-02 11:11:27 +00002810/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2811 convert a Unicode object to a wide character string.
2812
Victor Stinnerd88d9832011-09-06 02:00:05 +02002813 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 character) required to convert the unicode object. Ignore size argument.
2815
Victor Stinnerd88d9832011-09-06 02:00:05 +02002816 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002818 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002819static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002820unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002821 wchar_t *w,
2822 Py_ssize_t size)
2823{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002824 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 const wchar_t *wstr;
2826
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002827 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 if (wstr == NULL)
2829 return -1;
2830
Victor Stinner5593d8a2010-10-02 11:11:27 +00002831 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002832 if (size > res)
2833 size = res + 1;
2834 else
2835 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002837 return res;
2838 }
2839 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002841}
2842
2843Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002844PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002845 wchar_t *w,
2846 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847{
2848 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 PyErr_BadInternalCall();
2850 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853}
2854
Victor Stinner137c34c2010-09-29 10:25:54 +00002855wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002856PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002857 Py_ssize_t *size)
2858{
2859 wchar_t* buffer;
2860 Py_ssize_t buflen;
2861
2862 if (unicode == NULL) {
2863 PyErr_BadInternalCall();
2864 return NULL;
2865 }
2866
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002867 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 if (buflen == -1)
2869 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002871 PyErr_NoMemory();
2872 return NULL;
2873 }
2874
Victor Stinner137c34c2010-09-29 10:25:54 +00002875 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2876 if (buffer == NULL) {
2877 PyErr_NoMemory();
2878 return NULL;
2879 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002880 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002881 if (buflen == -1)
2882 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002883 if (size != NULL)
2884 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002885 return buffer;
2886}
2887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002888#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889
Alexander Belopolsky40018472011-02-26 01:02:56 +00002890PyObject *
2891PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002894 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 PyErr_SetString(PyExc_ValueError,
2896 "chr() arg not in range(0x110000)");
2897 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002898 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 if (ordinal < 256)
2901 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 v = PyUnicode_New(1, ordinal);
2904 if (v == NULL)
2905 return NULL;
2906 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002907 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002908 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002909}
2910
Alexander Belopolsky40018472011-02-26 01:02:56 +00002911PyObject *
2912PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002914 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002916 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002917 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002918 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 Py_INCREF(obj);
2920 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002921 }
2922 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 /* For a Unicode subtype that's not a Unicode object,
2924 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002925 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002926 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002927 PyErr_Format(PyExc_TypeError,
2928 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002929 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002930 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002931}
2932
Alexander Belopolsky40018472011-02-26 01:02:56 +00002933PyObject *
2934PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002935 const char *encoding,
2936 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002937{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002939 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002940
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 PyErr_BadInternalCall();
2943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002945
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002946 /* Decoding bytes objects is the most common case and should be fast */
2947 if (PyBytes_Check(obj)) {
2948 if (PyBytes_GET_SIZE(obj) == 0) {
2949 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002950 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002951 }
2952 else {
2953 v = PyUnicode_Decode(
2954 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2955 encoding, errors);
2956 }
2957 return v;
2958 }
2959
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002960 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002961 PyErr_SetString(PyExc_TypeError,
2962 "decoding str is not supported");
2963 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002964 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002965
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002966 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2967 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2968 PyErr_Format(PyExc_TypeError,
2969 "coercing to str: need bytes, bytearray "
2970 "or buffer-like object, %.80s found",
2971 Py_TYPE(obj)->tp_name);
2972 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002973 }
Tim Petersced69f82003-09-16 20:30:58 +00002974
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002975 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002976 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002977 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
Tim Petersced69f82003-09-16 20:30:58 +00002979 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002980 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002981
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002982 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002983 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984}
2985
/* Write a normalized copy of encoding into lower (a buffer of lower_len
   bytes): upper case letters are lowered and '_' becomes '-', so that e.g.
   UTF_8 is recognized.  A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    /* Last usable slot; it is reserved for the terminating null byte. */
    limit = &lower[lower_len - 1];
    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
3022
Alexander Belopolsky40018472011-02-26 01:02:56 +00003023PyObject *
3024PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003025 Py_ssize_t size,
3026 const char *encoding,
3027 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003028{
3029 PyObject *buffer = NULL, *unicode;
3030 Py_buffer info;
3031 char lower[11]; /* Enough for any encoding shortcut */
3032
Fred Drakee4315f52000-05-09 19:53:39 +00003033 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003034 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003035 if ((strcmp(lower, "utf-8") == 0) ||
3036 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003037 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003038 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003039 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003040 (strcmp(lower, "iso-8859-1") == 0))
3041 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003042#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003043 else if (strcmp(lower, "mbcs") == 0)
3044 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003045#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003046 else if (strcmp(lower, "ascii") == 0)
3047 return PyUnicode_DecodeASCII(s, size, errors);
3048 else if (strcmp(lower, "utf-16") == 0)
3049 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3050 else if (strcmp(lower, "utf-32") == 0)
3051 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053
3054 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003055 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003056 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003057 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003058 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 if (buffer == NULL)
3060 goto onError;
3061 unicode = PyCodec_Decode(buffer, encoding, errors);
3062 if (unicode == NULL)
3063 goto onError;
3064 if (!PyUnicode_Check(unicode)) {
3065 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003066 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003067 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 Py_DECREF(unicode);
3069 goto onError;
3070 }
3071 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003072 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003073
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 Py_XDECREF(buffer);
3076 return NULL;
3077}
3078
Alexander Belopolsky40018472011-02-26 01:02:56 +00003079PyObject *
3080PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003081 const char *encoding,
3082 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083{
3084 PyObject *v;
3085
3086 if (!PyUnicode_Check(unicode)) {
3087 PyErr_BadArgument();
3088 goto onError;
3089 }
3090
3091 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003093
3094 /* Decode via the codec registry */
3095 v = PyCodec_Decode(unicode, encoding, errors);
3096 if (v == NULL)
3097 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003098 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003099
Benjamin Peterson29060642009-01-31 22:14:21 +00003100 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003101 return NULL;
3102}
3103
Alexander Belopolsky40018472011-02-26 01:02:56 +00003104PyObject *
3105PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003106 const char *encoding,
3107 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003108{
3109 PyObject *v;
3110
3111 if (!PyUnicode_Check(unicode)) {
3112 PyErr_BadArgument();
3113 goto onError;
3114 }
3115
3116 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003117 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003118
3119 /* Decode via the codec registry */
3120 v = PyCodec_Decode(unicode, encoding, errors);
3121 if (v == NULL)
3122 goto onError;
3123 if (!PyUnicode_Check(v)) {
3124 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003125 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003126 Py_TYPE(v)->tp_name);
3127 Py_DECREF(v);
3128 goto onError;
3129 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003130 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003131
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003133 return NULL;
3134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 Py_ssize_t size,
3139 const char *encoding,
3140 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141{
3142 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003143
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 unicode = PyUnicode_FromUnicode(s, size);
3145 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3148 Py_DECREF(unicode);
3149 return v;
3150}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 const char *encoding,
3155 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156{
3157 PyObject *v;
3158
3159 if (!PyUnicode_Check(unicode)) {
3160 PyErr_BadArgument();
3161 goto onError;
3162 }
3163
3164 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003166
3167 /* Encode via the codec registry */
3168 v = PyCodec_Encode(unicode, encoding, errors);
3169 if (v == NULL)
3170 goto onError;
3171 return v;
3172
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003174 return NULL;
3175}
3176
/* Locate the first wide character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first unit that fails to convert, or 0 if every
   unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];   /* surrogate pair + terminator */
#else
    wchar_t chunk[2];   /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *unit_start;
    size_t converted;

    /* The last slot always holds the terminator. */
    chunk[sizeof(chunk) / sizeof(chunk[0]) - 1] = 0;

    while (*cursor != L'\0') {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor++;
#endif
        converted = wcstombs(scratch, chunk, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3223
Victor Stinner1b579672011-12-17 05:47:23 +01003224static int
3225locale_error_handler(const char *errors, int *surrogateescape)
3226{
3227 if (errors == NULL) {
3228 *surrogateescape = 0;
3229 return 0;
3230 }
3231
3232 if (strcmp(errors, "strict") == 0) {
3233 *surrogateescape = 0;
3234 return 0;
3235 }
3236 if (strcmp(errors, "surrogateescape") == 0) {
3237 *surrogateescape = 1;
3238 return 0;
3239 }
3240 PyErr_Format(PyExc_ValueError,
3241 "only 'strict' and 'surrogateescape' error handlers "
3242 "are supported, not '%s'",
3243 errors);
3244 return -1;
3245}
3246
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003247PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003248PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003249{
3250 Py_ssize_t wlen, wlen2;
3251 wchar_t *wstr;
3252 PyObject *bytes = NULL;
3253 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003254 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003255 PyObject *exc;
3256 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003257 int surrogateescape;
3258
3259 if (locale_error_handler(errors, &surrogateescape) < 0)
3260 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003261
3262 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3263 if (wstr == NULL)
3264 return NULL;
3265
3266 wlen2 = wcslen(wstr);
3267 if (wlen2 != wlen) {
3268 PyMem_Free(wstr);
3269 PyErr_SetString(PyExc_TypeError, "embedded null character");
3270 return NULL;
3271 }
3272
3273 if (surrogateescape) {
3274 /* locale encoding with surrogateescape */
3275 char *str;
3276
3277 str = _Py_wchar2char(wstr, &error_pos);
3278 if (str == NULL) {
3279 if (error_pos == (size_t)-1) {
3280 PyErr_NoMemory();
3281 PyMem_Free(wstr);
3282 return NULL;
3283 }
3284 else {
3285 goto encode_error;
3286 }
3287 }
3288 PyMem_Free(wstr);
3289
3290 bytes = PyBytes_FromString(str);
3291 PyMem_Free(str);
3292 }
3293 else {
3294 size_t len, len2;
3295
3296 len = wcstombs(NULL, wstr, 0);
3297 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003298 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003299 goto encode_error;
3300 }
3301
3302 bytes = PyBytes_FromStringAndSize(NULL, len);
3303 if (bytes == NULL) {
3304 PyMem_Free(wstr);
3305 return NULL;
3306 }
3307
3308 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3309 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003310 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003311 goto encode_error;
3312 }
3313 PyMem_Free(wstr);
3314 }
3315 return bytes;
3316
3317encode_error:
3318 errmsg = strerror(errno);
3319 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003320
3321 if (error_pos == (size_t)-1)
3322 error_pos = wcstombs_errorpos(wstr);
3323
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003324 PyMem_Free(wstr);
3325 Py_XDECREF(bytes);
3326
Victor Stinner2f197072011-12-17 07:08:30 +01003327 if (errmsg != NULL) {
3328 size_t errlen;
3329 wstr = _Py_char2wchar(errmsg, &errlen);
3330 if (wstr != NULL) {
3331 reason = PyUnicode_FromWideChar(wstr, errlen);
3332 PyMem_Free(wstr);
3333 } else
3334 errmsg = NULL;
3335 }
3336 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003337 reason = PyUnicode_FromString(
3338 "wcstombs() encountered an unencodable "
3339 "wide character");
3340 if (reason == NULL)
3341 return NULL;
3342
3343 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3344 "locale", unicode,
3345 (Py_ssize_t)error_pos,
3346 (Py_ssize_t)(error_pos+1),
3347 reason);
3348 Py_DECREF(reason);
3349 if (exc != NULL) {
3350 PyCodec_StrictErrors(exc);
3351 Py_XDECREF(exc);
3352 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003353 return NULL;
3354}
3355
Victor Stinnerad158722010-10-27 00:25:46 +00003356PyObject *
3357PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003358{
Victor Stinner99b95382011-07-04 14:23:54 +02003359#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003360 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003361#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003362 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003363#else
Victor Stinner793b5312011-04-27 00:24:21 +02003364 PyInterpreterState *interp = PyThreadState_GET()->interp;
3365 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3366 cannot use it to encode and decode filenames before it is loaded. Load
3367 the Python codec requires to encode at least its own filename. Use the C
3368 version of the locale codec until the codec registry is initialized and
3369 the Python codec is loaded.
3370
3371 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3372 cannot only rely on it: check also interp->fscodec_initialized for
3373 subinterpreters. */
3374 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003375 return PyUnicode_AsEncodedString(unicode,
3376 Py_FileSystemDefaultEncoding,
3377 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003378 }
3379 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003380 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003381 }
Victor Stinnerad158722010-10-27 00:25:46 +00003382#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003383}
3384
Alexander Belopolsky40018472011-02-26 01:02:56 +00003385PyObject *
3386PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003387 const char *encoding,
3388 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389{
3390 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003391 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003392
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 if (!PyUnicode_Check(unicode)) {
3394 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 }
Fred Drakee4315f52000-05-09 19:53:39 +00003397
Fred Drakee4315f52000-05-09 19:53:39 +00003398 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003399 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003400 if ((strcmp(lower, "utf-8") == 0) ||
3401 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003402 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003403 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003404 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003405 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003406 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003407 }
Victor Stinner37296e82010-06-10 13:36:23 +00003408 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003409 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003410 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003411 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003412#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003413 else if (strcmp(lower, "mbcs") == 0)
3414 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003415#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003416 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419
3420 /* Encode via the codec registry */
3421 v = PyCodec_Encode(unicode, encoding, errors);
3422 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003423 return NULL;
3424
3425 /* The normal path */
3426 if (PyBytes_Check(v))
3427 return v;
3428
3429 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003430 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003431 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003432 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003433
3434 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3435 "encoder %s returned bytearray instead of bytes",
3436 encoding);
3437 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003438 Py_DECREF(v);
3439 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003440 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003441
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003442 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3443 Py_DECREF(v);
3444 return b;
3445 }
3446
3447 PyErr_Format(PyExc_TypeError,
3448 "encoder did not return a bytes object (type=%.400s)",
3449 Py_TYPE(v)->tp_name);
3450 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003451 return NULL;
3452}
3453
Alexander Belopolsky40018472011-02-26 01:02:56 +00003454PyObject *
3455PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003456 const char *encoding,
3457 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003458{
3459 PyObject *v;
3460
3461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
3463 goto onError;
3464 }
3465
3466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003468
3469 /* Encode via the codec registry */
3470 v = PyCodec_Encode(unicode, encoding, errors);
3471 if (v == NULL)
3472 goto onError;
3473 if (!PyUnicode_Check(v)) {
3474 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003475 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003476 Py_TYPE(v)->tp_name);
3477 Py_DECREF(v);
3478 goto onError;
3479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 return NULL;
3484}
3485
/* Locate the first byte sequence in the len bytes at str that cannot be
   decoded with mbrtowc().

   Returns the byte offset of the invalid or incomplete sequence, or 0 if
   none is found.  Without HAVE_MBRTOWC the position cannot be determined
   and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t ch;
    size_t consumed;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        consumed = mbrtowc(&ch, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3516
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003517PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003519 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520{
3521 wchar_t smallbuf[256];
3522 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3523 wchar_t *wstr;
3524 size_t wlen, wlen2;
3525 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003526 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003527 size_t error_pos;
3528 char *errmsg;
3529 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003530
3531 if (locale_error_handler(errors, &surrogateescape) < 0)
3532 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533
3534 if (str[len] != '\0' || len != strlen(str)) {
3535 PyErr_SetString(PyExc_TypeError, "embedded null character");
3536 return NULL;
3537 }
3538
3539 if (surrogateescape)
3540 {
3541 wstr = _Py_char2wchar(str, &wlen);
3542 if (wstr == NULL) {
3543 if (wlen == (size_t)-1)
3544 PyErr_NoMemory();
3545 else
3546 PyErr_SetFromErrno(PyExc_OSError);
3547 return NULL;
3548 }
3549
3550 unicode = PyUnicode_FromWideChar(wstr, wlen);
3551 PyMem_Free(wstr);
3552 }
3553 else {
3554#ifndef HAVE_BROKEN_MBSTOWCS
3555 wlen = mbstowcs(NULL, str, 0);
3556#else
3557 wlen = len;
3558#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003559 if (wlen == (size_t)-1)
3560 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003561 if (wlen+1 <= smallbuf_len) {
3562 wstr = smallbuf;
3563 }
3564 else {
3565 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3566 return PyErr_NoMemory();
3567
3568 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3569 if (!wstr)
3570 return PyErr_NoMemory();
3571 }
3572
3573 /* This shouldn't fail now */
3574 wlen2 = mbstowcs(wstr, str, wlen+1);
3575 if (wlen2 == (size_t)-1) {
3576 if (wstr != smallbuf)
3577 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003578 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003579 }
3580#ifdef HAVE_BROKEN_MBSTOWCS
3581 assert(wlen2 == wlen);
3582#endif
3583 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3584 if (wstr != smallbuf)
3585 PyMem_Free(wstr);
3586 }
3587 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003588
3589decode_error:
3590 errmsg = strerror(errno);
3591 assert(errmsg != NULL);
3592
3593 error_pos = mbstowcs_errorpos(str, len);
3594 if (errmsg != NULL) {
3595 size_t errlen;
3596 wstr = _Py_char2wchar(errmsg, &errlen);
3597 if (wstr != NULL) {
3598 reason = PyUnicode_FromWideChar(wstr, errlen);
3599 PyMem_Free(wstr);
3600 } else
3601 errmsg = NULL;
3602 }
3603 if (errmsg == NULL)
3604 reason = PyUnicode_FromString(
3605 "mbstowcs() encountered an invalid multibyte sequence");
3606 if (reason == NULL)
3607 return NULL;
3608
3609 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3610 "locale", str, len,
3611 (Py_ssize_t)error_pos,
3612 (Py_ssize_t)(error_pos+1),
3613 reason);
3614 Py_DECREF(reason);
3615 if (exc != NULL) {
3616 PyCodec_StrictErrors(exc);
3617 Py_XDECREF(exc);
3618 }
3619 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003620}
3621
3622PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003623PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003624{
3625 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003626 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003627}
3628
3629
3630PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003631PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003632 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003633 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3634}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003635
Christian Heimes5894ba72007-11-04 11:43:14 +00003636PyObject*
3637PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3638{
Victor Stinner99b95382011-07-04 14:23:54 +02003639#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003640 return PyUnicode_DecodeMBCS(s, size, NULL);
3641#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003642 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003643#else
Victor Stinner793b5312011-04-27 00:24:21 +02003644 PyInterpreterState *interp = PyThreadState_GET()->interp;
3645 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3646 cannot use it to encode and decode filenames before it is loaded. Load
3647 the Python codec requires to encode at least its own filename. Use the C
3648 version of the locale codec until the codec registry is initialized and
3649 the Python codec is loaded.
3650
3651 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3652 cannot only rely on it: check also interp->fscodec_initialized for
3653 subinterpreters. */
3654 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003655 return PyUnicode_Decode(s, size,
3656 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003657 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003658 }
3659 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003660 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003661 }
Victor Stinnerad158722010-10-27 00:25:46 +00003662#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003663}
3664
Martin v. Löwis011e8422009-05-05 04:43:17 +00003665
3666int
Antoine Pitrou13348842012-01-29 18:36:34 +01003667_PyUnicode_HasNULChars(PyObject* s)
3668{
3669 static PyObject *nul = NULL;
3670
3671 if (nul == NULL)
3672 nul = PyUnicode_FromStringAndSize("\0", 1);
3673 if (nul == NULL)
3674 return -1;
3675 return PyUnicode_Contains(s, nul);
3676}
3677
3678
3679int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003680PyUnicode_FSConverter(PyObject* arg, void* addr)
3681{
3682 PyObject *output = NULL;
3683 Py_ssize_t size;
3684 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003685 if (arg == NULL) {
3686 Py_DECREF(*(PyObject**)addr);
3687 return 1;
3688 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003689 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003690 output = arg;
3691 Py_INCREF(output);
3692 }
3693 else {
3694 arg = PyUnicode_FromObject(arg);
3695 if (!arg)
3696 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003697 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003698 Py_DECREF(arg);
3699 if (!output)
3700 return 0;
3701 if (!PyBytes_Check(output)) {
3702 Py_DECREF(output);
3703 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3704 return 0;
3705 }
3706 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003707 size = PyBytes_GET_SIZE(output);
3708 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003709 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003710 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003711 Py_DECREF(output);
3712 return 0;
3713 }
3714 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003715 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003716}
3717
3718
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003719int
3720PyUnicode_FSDecoder(PyObject* arg, void* addr)
3721{
3722 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003723 if (arg == NULL) {
3724 Py_DECREF(*(PyObject**)addr);
3725 return 1;
3726 }
3727 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003728 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003730 output = arg;
3731 Py_INCREF(output);
3732 }
3733 else {
3734 arg = PyBytes_FromObject(arg);
3735 if (!arg)
3736 return 0;
3737 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3738 PyBytes_GET_SIZE(arg));
3739 Py_DECREF(arg);
3740 if (!output)
3741 return 0;
3742 if (!PyUnicode_Check(output)) {
3743 Py_DECREF(output);
3744 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3745 return 0;
3746 }
3747 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003748 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003749 Py_DECREF(output);
3750 return 0;
3751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003753 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003754 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3755 Py_DECREF(output);
3756 return 0;
3757 }
3758 *(PyObject**)addr = output;
3759 return Py_CLEANUP_SUPPORTED;
3760}
3761
3762
Martin v. Löwis5b222132007-06-10 09:51:05 +00003763char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003765{
Christian Heimesf3863112007-11-22 07:46:41 +00003766 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003768 if (!PyUnicode_Check(unicode)) {
3769 PyErr_BadArgument();
3770 return NULL;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003773 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003775 if (PyUnicode_UTF8(unicode) == NULL) {
3776 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3778 if (bytes == NULL)
3779 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003780 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3781 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 Py_DECREF(bytes);
3783 return NULL;
3784 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003785 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3786 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3787 PyBytes_AS_STRING(bytes),
3788 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 Py_DECREF(bytes);
3790 }
3791
3792 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003793 *psize = PyUnicode_UTF8_LENGTH(unicode);
3794 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003795}
3796
3797char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3801}
3802
3803#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003804static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805#endif
3806
3807
3808Py_UNICODE *
3809PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 const unsigned char *one_byte;
3812#if SIZEOF_WCHAR_T == 4
3813 const Py_UCS2 *two_bytes;
3814#else
3815 const Py_UCS4 *four_bytes;
3816 const Py_UCS4 *ucs4_end;
3817 Py_ssize_t num_surrogates;
3818#endif
3819 wchar_t *w;
3820 wchar_t *wchar_end;
3821
3822 if (!PyUnicode_Check(unicode)) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003828 assert(_PyUnicode_KIND(unicode) != 0);
3829 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830
3831#ifdef Py_DEBUG
3832 ++unicode_as_unicode_calls;
3833#endif
3834
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003837 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3838 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 num_surrogates = 0;
3840
3841 for (; four_bytes < ucs4_end; ++four_bytes) {
3842 if (*four_bytes > 0xFFFF)
3843 ++num_surrogates;
3844 }
3845
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003846 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3847 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3848 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 PyErr_NoMemory();
3850 return NULL;
3851 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003852 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003854 w = _PyUnicode_WSTR(unicode);
3855 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3856 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3858 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003859 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003861 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3862 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 }
3864 else
3865 *w = *four_bytes;
3866
3867 if (w > wchar_end) {
3868 assert(0 && "Miscalculated string end");
3869 }
3870 }
3871 *w = 0;
3872#else
3873 /* sizeof(wchar_t) == 4 */
3874 Py_FatalError("Impossible unicode object state, wstr and str "
3875 "should share memory already.");
3876 return NULL;
3877#endif
3878 }
3879 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3881 (_PyUnicode_LENGTH(unicode) + 1));
3882 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 PyErr_NoMemory();
3884 return NULL;
3885 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003886 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3887 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3888 w = _PyUnicode_WSTR(unicode);
3889 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3892 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 for (; w < wchar_end; ++one_byte, ++w)
3894 *w = *one_byte;
3895 /* null-terminate the wstr */
3896 *w = 0;
3897 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 for (; w < wchar_end; ++two_bytes, ++w)
3902 *w = *two_bytes;
3903 /* null-terminate the wstr */
3904 *w = 0;
3905#else
3906 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003907 PyObject_FREE(_PyUnicode_WSTR(unicode));
3908 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 Py_FatalError("Impossible unicode object state, wstr "
3910 "and str should share memory already.");
3911 return NULL;
3912#endif
3913 }
3914 else {
3915 assert(0 && "This should never happen.");
3916 }
3917 }
3918 }
3919 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003920 *size = PyUnicode_WSTR_LENGTH(unicode);
3921 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003922}
3923
Alexander Belopolsky40018472011-02-26 01:02:56 +00003924Py_UNICODE *
3925PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928}
3929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003930
Alexander Belopolsky40018472011-02-26 01:02:56 +00003931Py_ssize_t
3932PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933{
3934 if (!PyUnicode_Check(unicode)) {
3935 PyErr_BadArgument();
3936 goto onError;
3937 }
3938 return PyUnicode_GET_SIZE(unicode);
3939
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 return -1;
3942}
3943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944Py_ssize_t
3945PyUnicode_GetLength(PyObject *unicode)
3946{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003947 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 PyErr_BadArgument();
3949 return -1;
3950 }
3951
3952 return PyUnicode_GET_LENGTH(unicode);
3953}
3954
3955Py_UCS4
3956PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3957{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003958 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3959 PyErr_BadArgument();
3960 return (Py_UCS4)-1;
3961 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003962 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003963 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 return (Py_UCS4)-1;
3965 }
3966 return PyUnicode_READ_CHAR(unicode, index);
3967}
3968
3969int
3970PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3971{
3972 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003973 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 return -1;
3975 }
Victor Stinner488fa492011-12-12 00:01:39 +01003976 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003977 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003978 PyErr_SetString(PyExc_IndexError, "string index out of range");
3979 return -1;
3980 }
Victor Stinner488fa492011-12-12 00:01:39 +01003981 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003982 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003983 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3984 PyErr_SetString(PyExc_ValueError, "character out of range");
3985 return -1;
3986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3988 index, ch);
3989 return 0;
3990}
3991
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3997
Victor Stinner554f3f02010-06-16 23:33:54 +00003998/* create or adjust a UnicodeDecodeError */
3999static void
4000make_decode_exception(PyObject **exceptionObject,
4001 const char *encoding,
4002 const char *input, Py_ssize_t length,
4003 Py_ssize_t startpos, Py_ssize_t endpos,
4004 const char *reason)
4005{
4006 if (*exceptionObject == NULL) {
4007 *exceptionObject = PyUnicodeDecodeError_Create(
4008 encoding, input, length, startpos, endpos, reason);
4009 }
4010 else {
4011 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4012 goto onError;
4013 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4014 goto onError;
4015 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4016 goto onError;
4017 }
4018 return;
4019
4020onError:
4021 Py_DECREF(*exceptionObject);
4022 *exceptionObject = NULL;
4023}
4024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025/* error handling callback helper:
4026 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004027 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 and adjust various state variables.
4029 return 0 on success, -1 on error
4030*/
4031
Alexander Belopolsky40018472011-02-26 01:02:56 +00004032static int
4033unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004034 const char *encoding, const char *reason,
4035 const char **input, const char **inend, Py_ssize_t *startinpos,
4036 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004037 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004039 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040
4041 PyObject *restuple = NULL;
4042 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004043 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004044 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004045 Py_ssize_t requiredsize;
4046 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004047 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 int res = -1;
4049
Victor Stinner596a6c42011-11-09 00:02:18 +01004050 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4051 outsize = PyUnicode_GET_LENGTH(*output);
4052 else
4053 outsize = _PyUnicode_WSTR_LENGTH(*output);
4054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 *errorHandler = PyCodec_LookupError(errors);
4057 if (*errorHandler == NULL)
4058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 }
4060
Victor Stinner554f3f02010-06-16 23:33:54 +00004061 make_decode_exception(exceptionObject,
4062 encoding,
4063 *input, *inend - *input,
4064 *startinpos, *endinpos,
4065 reason);
4066 if (*exceptionObject == NULL)
4067 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068
4069 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4070 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004073 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 }
4076 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004078 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004079 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004080
4081 /* Copy back the bytes variables, which might have been modified by the
4082 callback */
4083 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4084 if (!inputobj)
4085 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004086 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004088 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004089 *input = PyBytes_AS_STRING(inputobj);
4090 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004091 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004092 /* we can DECREF safely, as the exception has another reference,
4093 so the object won't go away. */
4094 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004095
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004096 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004098 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4100 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004101 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102
Victor Stinner596a6c42011-11-09 00:02:18 +01004103 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4104 /* need more space? (at least enough for what we
4105 have+the replacement+the rest of the string (starting
4106 at the new input position), so we won't have to check space
4107 when there are no errors in the rest of the string) */
4108 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4109 requiredsize = *outpos + replen + insize-newpos;
4110 if (requiredsize > outsize) {
4111 if (requiredsize<2*outsize)
4112 requiredsize = 2*outsize;
4113 if (unicode_resize(output, requiredsize) < 0)
4114 goto onError;
4115 }
4116 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004117 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004118 copy_characters(*output, *outpos, repunicode, 0, replen);
4119 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004121 else {
4122 wchar_t *repwstr;
4123 Py_ssize_t repwlen;
4124 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4125 if (repwstr == NULL)
4126 goto onError;
4127 /* need more space? (at least enough for what we
4128 have+the replacement+the rest of the string (starting
4129 at the new input position), so we won't have to check space
4130 when there are no errors in the rest of the string) */
4131 requiredsize = *outpos + repwlen + insize-newpos;
4132 if (requiredsize > outsize) {
4133 if (requiredsize < 2*outsize)
4134 requiredsize = 2*outsize;
4135 if (unicode_resize(output, requiredsize) < 0)
4136 goto onError;
4137 }
4138 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4139 *outpos += repwlen;
4140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 /* we made it! */
4145 res = 0;
4146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 Py_XDECREF(restuple);
4149 return res;
4150}
4151
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004152/* --- UTF-7 Codec -------------------------------------------------------- */
4153
Antoine Pitrou244651a2009-05-04 18:56:13 +00004154/* See RFC2152 for details. We encode conservatively and decode liberally. */
4155
/* Three small macros implementing the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/').  Note that each of them may evaluate its
   argument more than once. */

/* Is c one of the 64 base-64 characters? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: does this byte, found in a UTF-7 string outside a
 * base-64 section, decode as itself?  Decoding is permissive: every
 * 7-bit value is accepted except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4186
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table maps each 7-bit value to its set:
 *    0 : "Set D"
 *        alphanumeric and '(),-./:?
 *    1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *    2 : "whitespace"
 *        ht nl cr sp
 *    3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: should this character be encoded as itself?  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * NUL and everything >= 128 is never direct.  All three parameters are
 * parenthesized in the expansion so that compound expressions such as
 * `a || b` can be passed safely; c may be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004232
Alexander Belopolsky40018472011-02-26 01:02:56 +00004233PyObject *
4234PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004235 Py_ssize_t size,
4236 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004237{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004238 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4239}
4240
Antoine Pitrou244651a2009-05-04 18:56:13 +00004241/* The decoder. The only state we preserve is our read position,
4242 * i.e. how many characters we have consumed. So if we end in the
4243 * middle of a shift sequence we have to back off the read position
4244 * and the output to the beginning of the sequence, otherwise we lose
4245 * all the shift state (seen bits, number of bits seen, high
4246 * surrogate). */
4247
Alexander Belopolsky40018472011-02-26 01:02:56 +00004248PyObject *
4249PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004250 Py_ssize_t size,
4251 const char *errors,
4252 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004255 Py_ssize_t startinpos;
4256 Py_ssize_t endinpos;
4257 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004259 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004260 const char *errmsg = "";
4261 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004262 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 unsigned int base64bits = 0;
4264 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004265 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266 PyObject *errorHandler = NULL;
4267 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004269 /* Start off assuming it's all ASCII. Widen later as necessary. */
4270 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 if (!unicode)
4272 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004273 if (size == 0) {
4274 if (consumed)
4275 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004276 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004277 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004278
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004279 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280 e = s + size;
4281
4282 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004283 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004285 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 if (inShift) { /* in a base-64 section */
4288 if (IS_BASE64(ch)) { /* consume a base-64 character */
4289 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4290 base64bits += 6;
4291 s++;
4292 if (base64bits >= 16) {
4293 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004294 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 base64bits -= 16;
4296 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4297 if (surrogate) {
4298 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004299 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4300 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004301 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4302 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004304 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004305 }
4306 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004307 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4308 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 }
4311 }
Victor Stinner551ac952011-11-29 22:58:13 +01004312 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 /* first surrogate */
4314 surrogate = outCh;
4315 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4318 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
4320 }
4321 }
4322 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 inShift = 0;
4324 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004326 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4327 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004328 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004329 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 if (base64bits > 0) { /* left-over bits */
4331 if (base64bits >= 6) {
4332 /* We've seen at least one base-64 character */
4333 errmsg = "partial character in shift sequence";
4334 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004336 else {
4337 /* Some bits remain; they should be zero */
4338 if (base64buffer != 0) {
4339 errmsg = "non-zero padding bits in shift sequence";
4340 goto utf7Error;
4341 }
4342 }
4343 }
4344 if (ch != '-') {
4345 /* '-' is absorbed; other terminating
4346 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004347 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4348 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350 }
4351 }
4352 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 s++; /* consume '+' */
4355 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004356 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4358 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004362 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004364 }
4365 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4368 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004369 s++;
4370 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 else {
4372 startinpos = s-starts;
4373 s++;
4374 errmsg = "unexpected special character";
4375 goto utf7Error;
4376 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 endinpos = s-starts;
4380 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 errors, &errorHandler,
4382 "utf7", errmsg,
4383 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 }
4387
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 /* end of string */
4389
4390 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4391 /* if we're in an inconsistent state, that's an error */
4392 if (surrogate ||
4393 (base64bits >= 6) ||
4394 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 endinpos = size;
4396 if (unicode_decode_call_errorhandler(
4397 errors, &errorHandler,
4398 "utf7", "unterminated shift sequence",
4399 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004400 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 goto onError;
4402 if (s < e)
4403 goto restart;
4404 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406
4407 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004408 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004410 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004411 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 }
4413 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004414 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004416 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004418 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 goto onError;
4420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 Py_XDECREF(errorHandler);
4422 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004423 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 Py_XDECREF(errorHandler);
4427 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 Py_DECREF(unicode);
4429 return NULL;
4430}
4431
4432
Alexander Belopolsky40018472011-02-26 01:02:56 +00004433PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004434_PyUnicode_EncodeUTF7(PyObject *str,
4435 int base64SetO,
4436 int base64WhiteSpace,
4437 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004439 int kind;
4440 void *data;
4441 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004442 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004443 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004445 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446 unsigned int base64bits = 0;
4447 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004448 char * out;
4449 char * start;
4450
Benjamin Petersonbac79492012-01-14 13:34:47 -05004451 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004452 return NULL;
4453 kind = PyUnicode_KIND(str);
4454 data = PyUnicode_DATA(str);
4455 len = PyUnicode_GET_LENGTH(str);
4456
4457 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004460 /* It might be possible to tighten this worst case */
4461 allocated = 8 * len;
4462 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004463 return PyErr_NoMemory();
4464
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 if (v == NULL)
4467 return NULL;
4468
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004469 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004470 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004471 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 if (inShift) {
4474 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4475 /* shifting out */
4476 if (base64bits) { /* output remaining bits */
4477 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4478 base64buffer = 0;
4479 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 }
4481 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004482 /* Characters not in the BASE64 set implicitly unshift the sequence
4483 so no '-' is required, except if the character is itself a '-' */
4484 if (IS_BASE64(ch) || ch == '-') {
4485 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 *out++ = (char) ch;
4488 }
4489 else {
4490 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004491 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 else { /* not in a shift sequence */
4494 if (ch == '+') {
4495 *out++ = '+';
4496 *out++ = '-';
4497 }
4498 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4499 *out++ = (char) ch;
4500 }
4501 else {
4502 *out++ = '+';
4503 inShift = 1;
4504 goto encode_char;
4505 }
4506 }
4507 continue;
4508encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004510 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004511
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 /* code first surrogate */
4513 base64bits += 16;
4514 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4515 while (base64bits >= 6) {
4516 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4517 base64bits -= 6;
4518 }
4519 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004520 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004522 base64bits += 16;
4523 base64buffer = (base64buffer << 16) | ch;
4524 while (base64bits >= 6) {
4525 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4526 base64bits -= 6;
4527 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004528 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (base64bits)
4530 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4531 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004533 if (_PyBytes_Resize(&v, out - start) < 0)
4534 return NULL;
4535 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004537PyObject *
4538PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4539 Py_ssize_t size,
4540 int base64SetO,
4541 int base64WhiteSpace,
4542 const char *errors)
4543{
4544 PyObject *result;
4545 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4546 if (tmp == NULL)
4547 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004548 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004549 base64WhiteSpace, errors);
4550 Py_DECREF(tmp);
4551 return result;
4552}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553
/* The UTF-7 codec above is done with its helper macros; undefine them so
   they are not visible to the code that follows. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560/* --- UTF-8 Codec -------------------------------------------------------- */
4561
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it starts.  Zero means the byte cannot start a sequence
   (continuation bytes 80-BF, the overlong prefixes C0-C1, and F5-FF).
   See RFC 3629 for details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4583
Alexander Belopolsky40018472011-02-26 01:02:56 +00004584PyObject *
4585PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004586 Py_ssize_t size,
4587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588{
Walter Dörwald69652032004-09-07 20:24:22 +00004589 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4590}
4591
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004592#include "stringlib/ucs1lib.h"
4593#include "stringlib/codecs.h"
4594#include "stringlib/undef.h"
4595
4596#include "stringlib/ucs2lib.h"
4597#include "stringlib/codecs.h"
4598#include "stringlib/undef.h"
4599
4600#include "stringlib/ucs4lib.h"
4601#include "stringlib/codecs.h"
4602#include "stringlib/undef.h"
4603
Antoine Pitrouab868312009-01-10 15:40:25 +00004604/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4605#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4606
4607/* Mask to quickly check whether a C 'long' contains a
4608 non-ASCII, UTF8-encoded char. */
4609#if (SIZEOF_LONG == 8)
4610# define ASCII_CHAR_MASK 0x8080808080808080L
4611#elif (SIZEOF_LONG == 4)
4612# define ASCII_CHAR_MASK 0x80808080L
4613#else
4614# error C 'long' size should be either 4 or 8!
4615#endif
4616
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004617/* Scans a UTF-8 string and returns the maximum character to be expected
4618 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004619
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004620 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004621 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004622 */
4623static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004624utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004627 const unsigned char *end = p + string_size;
4628 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004629
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004630 assert(unicode_size != NULL);
4631
4632 /* By having a cascade of independent loops which fallback onto each
4633 other, we minimize the amount of work done in the average loop
4634 iteration, and we also maximize the CPU's ability to predict
4635 branches correctly (because a given condition will have always the
4636 same boolean outcome except perhaps in the last iteration of the
4637 corresponding loop).
4638 In the general case this brings us rather close to decoding
4639 performance pre-PEP 393, despite the two-pass decoding.
4640
4641 Note that the pure ASCII loop is not duplicated once a non-ASCII
4642 character has been encountered. It is actually a pessimization (by
4643 a significant factor) to use this loop on text with many non-ASCII
4644 characters, and it is important to avoid bad performance on valid
4645 utf-8 data (invalid utf-8 being a different can of worms).
4646 */
4647
4648 /* ASCII */
4649 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650 /* Only check value if it's not a ASCII char... */
4651 if (*p < 0x80) {
4652 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4653 an explanation. */
4654 if (!((size_t) p & LONG_PTR_MASK)) {
4655 /* Help register allocation */
4656 register const unsigned char *_p = p;
4657 while (_p < aligned_end) {
4658 unsigned long value = *(unsigned long *) _p;
4659 if (value & ASCII_CHAR_MASK)
4660 break;
4661 _p += SIZEOF_LONG;
4662 char_count += SIZEOF_LONG;
4663 }
4664 p = _p;
4665 if (p == end)
4666 break;
4667 }
4668 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004669 if (*p < 0x80)
4670 ++char_count;
4671 else
4672 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004674 *unicode_size = char_count;
4675 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004677_ucs1loop:
4678 for (; p < end; ++p) {
4679 if (*p < 0xc4)
4680 char_count += ((*p & 0xc0) != 0x80);
4681 else
4682 goto _ucs2loop;
4683 }
4684 *unicode_size = char_count;
4685 return 255;
4686
4687_ucs2loop:
4688 for (; p < end; ++p) {
4689 if (*p < 0xf0)
4690 char_count += ((*p & 0xc0) != 0x80);
4691 else
4692 goto _ucs4loop;
4693 }
4694 *unicode_size = char_count;
4695 return 65535;
4696
4697_ucs4loop:
4698 for (; p < end; ++p) {
4699 char_count += ((*p & 0xc0) != 0x80);
4700 }
4701 *unicode_size = char_count;
4702 return 65537;
4703}
4704
/* Store `value` at position `index` of the string being decoded, growing
   the string first when the position lies beyond its current length.
   Expands to code that uses a `PyObject *unicode` variable and an `onError`
   label from the expansion site.
   Growth over-allocates (pos + pos/8), so the caller has to shrink the
   result once decoding is finished.
   NOTE(review): the bound test is '>' rather than '>=', so an index equal
   to the current length is written without a resize -- confirm that this
   is intended. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t write_pos = index;                                   \
        if (write_pos > PyUnicode_GET_LENGTH(unicode)) {                \
            if (unicode_resize(&unicode,                                \
                               write_pos + write_pos/8) < 0)            \
                goto onError;                                           \
        }                                                               \
        if (unicode_putchar(&unicode, &write_pos, value) < 0)           \
            goto onError;                                               \
    } while (0)
4718
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004719static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004720decode_utf8_errors(const char *starts,
4721 Py_ssize_t size,
4722 const char *errors,
4723 Py_ssize_t *consumed,
4724 const char *s,
4725 PyObject *unicode,
4726 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004727{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004729 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004730 Py_ssize_t startinpos;
4731 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004732 const char *e = starts + size;
4733 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004734 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 PyObject *errorHandler = NULL;
4736 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004737
Antoine Pitrouab868312009-01-10 15:40:25 +00004738 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739
4740 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004741 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
4743 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004744 /* Fast path for runs of ASCII characters. Given that common UTF-8
4745 input will consist of an overwhelming majority of ASCII
4746 characters, we try to optimize for this case by checking
4747 as many characters as a C 'long' can contain.
4748 First, check if we can do an aligned read, as most CPUs have
4749 a penalty for unaligned reads.
4750 */
4751 if (!((size_t) s & LONG_PTR_MASK)) {
4752 /* Help register allocation */
4753 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004754 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004755 while (_s < aligned_end) {
4756 /* Read a whole long at a time (either 4 or 8 bytes),
4757 and do a fast unrolled copy if it only contains ASCII
4758 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759 unsigned long value = *(unsigned long *) _s;
4760 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004761 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004762 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4763 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4764 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4765 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004766#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004767 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4768 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4769 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4770 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004771#endif
4772 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004774 }
4775 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004777 if (s == e)
4778 break;
4779 ch = (unsigned char)*s;
4780 }
4781 }
4782
4783 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004784 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 s++;
4786 continue;
4787 }
4788
4789 n = utf8_code_length[ch];
4790
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004791 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 if (consumed)
4793 break;
4794 else {
4795 errmsg = "unexpected end of data";
4796 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004797 endinpos = startinpos+1;
4798 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4799 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 goto utf8Error;
4801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803
4804 switch (n) {
4805
4806 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004807 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 startinpos = s-starts;
4809 endinpos = startinpos+1;
4810 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
4812 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004813 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 startinpos = s-starts;
4815 endinpos = startinpos+1;
4816 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
4818 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004819 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004820 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004822 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004823 goto utf8Error;
4824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004826 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004827 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 break;
4829
4830 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004831 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4832 will result in surrogates in range d800-dfff. Surrogates are
4833 not valid UTF-8 so they are rejected.
4834 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4835 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004836 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004837 (s[2] & 0xc0) != 0x80 ||
4838 ((unsigned char)s[0] == 0xE0 &&
4839 (unsigned char)s[1] < 0xA0) ||
4840 ((unsigned char)s[0] == 0xED &&
4841 (unsigned char)s[1] > 0x9F)) {
4842 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004844 endinpos = startinpos + 1;
4845
4846 /* if s[1] first two bits are 1 and 0, then the invalid
4847 continuation byte is s[2], so increment endinpos by 1,
4848 if not, s[1] is invalid and endinpos doesn't need to
4849 be incremented. */
4850 if ((s[1] & 0xC0) == 0x80)
4851 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 goto utf8Error;
4853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004855 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004856 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004857 break;
4858
4859 case 4:
4860 if ((s[1] & 0xc0) != 0x80 ||
4861 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004862 (s[3] & 0xc0) != 0x80 ||
4863 ((unsigned char)s[0] == 0xF0 &&
4864 (unsigned char)s[1] < 0x90) ||
4865 ((unsigned char)s[0] == 0xF4 &&
4866 (unsigned char)s[1] > 0x8F)) {
4867 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004869 endinpos = startinpos + 1;
4870 if ((s[1] & 0xC0) == 0x80) {
4871 endinpos++;
4872 if ((s[2] & 0xC0) == 0x80)
4873 endinpos++;
4874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 goto utf8Error;
4876 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004877 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004878 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004879 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004880
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004881 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 }
4884 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004886
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (unicode_decode_call_errorhandler(
4889 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004890 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004892 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004894 /* Update data because unicode_decode_call_errorhandler might have
4895 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 }
Walter Dörwald69652032004-09-07 20:24:22 +00004898 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004901 /* Adjust length and ready string when it contained errors and
4902 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004903 if (unicode_resize(&unicode, i) < 0)
4904 goto onError;
4905 unicode_adjust_maxchar(&unicode);
4906 if (unicode == NULL)
4907 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 Py_XDECREF(errorHandler);
4910 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004911 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004912 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 Py_XDECREF(errorHandler);
4916 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004917 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 return NULL;
4919}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004920#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004921
Victor Stinner785938e2011-12-11 20:09:03 +01004922PyObject *
4923PyUnicode_DecodeUTF8Stateful(const char *s,
4924 Py_ssize_t size,
4925 const char *errors,
4926 Py_ssize_t *consumed)
4927{
4928 Py_UCS4 maxchar = 0;
4929 Py_ssize_t unicode_size;
4930 int has_errors = 0;
4931 PyObject *unicode;
4932 int kind;
4933 void *data;
4934 const char *starts = s;
4935 const char *e;
4936 Py_ssize_t i;
4937
4938 if (size == 0) {
4939 if (consumed)
4940 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004941 Py_INCREF(unicode_empty);
4942 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004943 }
4944
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004945 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004946
4947 /* When the string is ASCII only, just use memcpy and return.
4948 unicode_size may be != size if there is an incomplete UTF-8
4949 sequence at the end of the ASCII block. */
4950 if (maxchar < 128 && size == unicode_size) {
4951 if (consumed)
4952 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004953 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004954 }
4955
4956 unicode = PyUnicode_New(unicode_size, maxchar);
4957 if (!unicode)
4958 return NULL;
4959 kind = PyUnicode_KIND(unicode);
4960 data = PyUnicode_DATA(unicode);
4961
4962 /* Unpack UTF-8 encoded data */
4963 i = 0;
4964 e = starts + size;
4965 switch (kind) {
4966 case PyUnicode_1BYTE_KIND:
4967 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4968 break;
4969 case PyUnicode_2BYTE_KIND:
4970 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4971 break;
4972 case PyUnicode_4BYTE_KIND:
4973 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4974 break;
4975 }
4976 if (!has_errors) {
4977 /* Ensure the unicode size calculation was correct */
4978 assert(i == unicode_size);
4979 assert(s == e);
4980 if (consumed)
4981 *consumed = size;
4982 return unicode;
4983 }
4984
4985 /* In case of errors, maxchar and size computation might be incorrect;
4986 code below refits and resizes as necessary. */
4987 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4988}
4989
#ifdef __APPLE__

/* Minimal UTF-8 decoder with built-in surrogateescape behaviour, used to
   decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t string.  A byte that does not start a well-formed
   sequence is emitted as 0xDC00 + byte and decoding resumes at the next
   byte.  Returns NULL if the buffer size would overflow or the allocation
   fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *result, *dst;
    int seqlen;

    /* One wchar_t per input byte plus the terminator is always enough:
       the output never has more units than the input has bytes. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (result == NULL)
        return NULL;

    dst = result;
    for (end = s + size; s < end; ) {
        /* unsigned view of the bytes of the current sequence */
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 ch = b[0];

        if (ch < 0x80) {
            /* plain ASCII */
            *dst++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > end)
            goto escape_byte;       /* sequence cut off by end of input */

        if (seqlen == 0 || seqlen == 1) {
            /* not a valid lead byte */
            goto escape_byte;
        }
        else if (seqlen == 2) {
            if ((b[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *dst++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            /* Sequences \xed\xa0\x80-\xed\xbf\xbf would decode to the
               surrogates d800-dfff, which are not valid UTF-8, so they are
               rejected (as are overlong E0 forms).
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[0] == 0xE0 && b[1] < 0xA0) ||
                (b[0] == 0xED && b[1] > 0x9F)) {
                goto escape_byte;
            }
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *dst++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80 ||
                (b[0] == 0xF0 && b[1] < 0x90) ||
                (b[0] == 0xF4 && b[1] > 0x8F)) {
                goto escape_byte;
            }
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *dst++ = (wchar_t)ch;
#else
            /* 16-bit wchar_t: store the character as a surrogate pair */
            *dst++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *dst++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        s += seqlen;
        continue;

      escape_byte:
        /* ch still holds the offending lead byte here */
        *dst++ = 0xDC00 + ch;
        s++;
    }
    *dst = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005098/* Primary internal function which creates utf8 encoded bytes objects.
5099
5100 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005101 and allocate exactly as much space needed at the end. Else allocate the
5102 maximum possible needed (4 result bytes per Unicode character), and return
5103 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005104*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005105PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005106_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107{
Victor Stinner6099a032011-12-18 14:22:26 +01005108 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 void *data;
5110 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112 if (!PyUnicode_Check(unicode)) {
5113 PyErr_BadArgument();
5114 return NULL;
5115 }
5116
5117 if (PyUnicode_READY(unicode) == -1)
5118 return NULL;
5119
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005120 if (PyUnicode_UTF8(unicode))
5121 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5122 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123
5124 kind = PyUnicode_KIND(unicode);
5125 data = PyUnicode_DATA(unicode);
5126 size = PyUnicode_GET_LENGTH(unicode);
5127
Benjamin Petersonead6b532011-12-20 17:23:42 -06005128 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005129 default:
5130 assert(0);
5131 case PyUnicode_1BYTE_KIND:
5132 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5133 assert(!PyUnicode_IS_ASCII(unicode));
5134 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5135 case PyUnicode_2BYTE_KIND:
5136 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5137 case PyUnicode_4BYTE_KIND:
5138 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140}
5141
Alexander Belopolsky40018472011-02-26 01:02:56 +00005142PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5144 Py_ssize_t size,
5145 const char *errors)
5146{
5147 PyObject *v, *unicode;
5148
5149 unicode = PyUnicode_FromUnicode(s, size);
5150 if (unicode == NULL)
5151 return NULL;
5152 v = _PyUnicode_AsUTF8String(unicode, errors);
5153 Py_DECREF(unicode);
5154 return v;
5155}
5156
5157PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005158PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005160 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161}
5162
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163/* --- UTF-32 Codec ------------------------------------------------------- */
5164
5165PyObject *
5166PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_ssize_t size,
5168 const char *errors,
5169 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005170{
5171 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5172}
5173
5174PyObject *
5175PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 Py_ssize_t size,
5177 const char *errors,
5178 int *byteorder,
5179 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005180{
5181 const char *starts = s;
5182 Py_ssize_t startinpos;
5183 Py_ssize_t endinpos;
5184 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005185 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005186 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005187 int bo = 0; /* assume native ordering by default */
5188 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005189 /* Offsets from q for retrieving bytes in the right order. */
5190#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5191 int iorder[] = {0, 1, 2, 3};
5192#else
5193 int iorder[] = {3, 2, 1, 0};
5194#endif
5195 PyObject *errorHandler = NULL;
5196 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005197
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 q = (unsigned char *)s;
5199 e = q + size;
5200
5201 if (byteorder)
5202 bo = *byteorder;
5203
5204 /* Check for BOM marks (U+FEFF) in the input and adjust current
5205 byte order setting accordingly. In native mode, the leading BOM
5206 mark is skipped, in all other modes, it is copied to the output
5207 stream as-is (giving a ZWNBSP character). */
5208 if (bo == 0) {
5209 if (size >= 4) {
5210 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 if (bom == 0x0000FEFF) {
5214 q += 4;
5215 bo = -1;
5216 }
5217 else if (bom == 0xFFFE0000) {
5218 q += 4;
5219 bo = 1;
5220 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005221#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 if (bom == 0x0000FEFF) {
5223 q += 4;
5224 bo = 1;
5225 }
5226 else if (bom == 0xFFFE0000) {
5227 q += 4;
5228 bo = -1;
5229 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 }
5233
5234 if (bo == -1) {
5235 /* force LE */
5236 iorder[0] = 0;
5237 iorder[1] = 1;
5238 iorder[2] = 2;
5239 iorder[3] = 3;
5240 }
5241 else if (bo == 1) {
5242 /* force BE */
5243 iorder[0] = 3;
5244 iorder[1] = 2;
5245 iorder[2] = 1;
5246 iorder[3] = 0;
5247 }
5248
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005249 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005250 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005251 if (!unicode)
5252 return NULL;
5253 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005254 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005255 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005256
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 Py_UCS4 ch;
5259 /* remaining bytes at the end? (size should be divisible by 4) */
5260 if (e-q<4) {
5261 if (consumed)
5262 break;
5263 errmsg = "truncated data";
5264 startinpos = ((const char *)q)-starts;
5265 endinpos = ((const char *)e)-starts;
5266 goto utf32Error;
5267 /* The remaining input chars are ignored if the callback
5268 chooses to skip the input */
5269 }
5270 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5271 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005272
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 if (ch >= 0x110000)
5274 {
5275 errmsg = "codepoint not in range(0x110000)";
5276 startinpos = ((const char *)q)-starts;
5277 endinpos = startinpos+4;
5278 goto utf32Error;
5279 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005280 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5281 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 q += 4;
5283 continue;
5284 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 if (unicode_decode_call_errorhandler(
5286 errors, &errorHandler,
5287 "utf32", errmsg,
5288 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005289 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005291 }
5292
5293 if (byteorder)
5294 *byteorder = bo;
5295
5296 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005298
5299 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005300 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005301 goto onError;
5302
5303 Py_XDECREF(errorHandler);
5304 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005305 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005306
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308 Py_DECREF(unicode);
5309 Py_XDECREF(errorHandler);
5310 Py_XDECREF(exc);
5311 return NULL;
5312}
5313
5314PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005315_PyUnicode_EncodeUTF32(PyObject *str,
5316 const char *errors,
5317 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005318{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005319 int kind;
5320 void *data;
5321 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005322 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005323 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005324 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005325 /* Offsets from p for storing byte pairs in the right order. */
5326#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5327 int iorder[] = {0, 1, 2, 3};
5328#else
5329 int iorder[] = {3, 2, 1, 0};
5330#endif
5331
Benjamin Peterson29060642009-01-31 22:14:21 +00005332#define STORECHAR(CH) \
5333 do { \
5334 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5335 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5336 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5337 p[iorder[0]] = (CH) & 0xff; \
5338 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005339 } while(0)
5340
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005341 if (!PyUnicode_Check(str)) {
5342 PyErr_BadArgument();
5343 return NULL;
5344 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005345 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005346 return NULL;
5347 kind = PyUnicode_KIND(str);
5348 data = PyUnicode_DATA(str);
5349 len = PyUnicode_GET_LENGTH(str);
5350
5351 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005352 bytesize = nsize * 4;
5353 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005355 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356 if (v == NULL)
5357 return NULL;
5358
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005359 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005362 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005363 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364
5365 if (byteorder == -1) {
5366 /* force LE */
5367 iorder[0] = 0;
5368 iorder[1] = 1;
5369 iorder[2] = 2;
5370 iorder[3] = 3;
5371 }
5372 else if (byteorder == 1) {
5373 /* force BE */
5374 iorder[0] = 3;
5375 iorder[1] = 2;
5376 iorder[2] = 1;
5377 iorder[3] = 0;
5378 }
5379
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005380 for (i = 0; i < len; i++)
5381 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005382
5383 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005384 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005385#undef STORECHAR
5386}
5387
Alexander Belopolsky40018472011-02-26 01:02:56 +00005388PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5390 Py_ssize_t size,
5391 const char *errors,
5392 int byteorder)
5393{
5394 PyObject *result;
5395 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5396 if (tmp == NULL)
5397 return NULL;
5398 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5399 Py_DECREF(tmp);
5400 return result;
5401}
5402
5403PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005404PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405{
Victor Stinnerb960b342011-11-20 19:12:52 +01005406 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407}
5408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409/* --- UTF-16 Codec ------------------------------------------------------- */
5410
Tim Peters772747b2001-08-09 22:21:55 +00005411PyObject *
5412PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 Py_ssize_t size,
5414 const char *errors,
5415 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416{
Walter Dörwald69652032004-09-07 20:24:22 +00005417 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5418}
5419
/* Masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   STRIPPED_MASK selects the low byte of every 16-bit unit and is used to
   byte-swap all the units held in a 'long' at once.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005427#if (SIZEOF_LONG == 8)
5428# define FAST_CHAR_MASK 0x8000800080008000L
5429# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005430# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005431#elif (SIZEOF_LONG == 4)
5432# define FAST_CHAR_MASK 0x80008000L
5433# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005434# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005435#else
5436# error C 'long' size should be either 4 or 8!
5437#endif
5438
Walter Dörwald69652032004-09-07 20:24:22 +00005439PyObject *
5440PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 Py_ssize_t size,
5442 const char *errors,
5443 int *byteorder,
5444 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005445{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005447 Py_ssize_t startinpos;
5448 Py_ssize_t endinpos;
5449 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005450 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005451 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005452 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005453 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005454 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005455 /* Offsets from q for retrieving byte pairs in the right order. */
5456#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5457 int ihi = 1, ilo = 0;
5458#else
5459 int ihi = 0, ilo = 1;
5460#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
5464 /* Note: size will always be longer than the resulting Unicode
5465 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005466 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 if (!unicode)
5468 return NULL;
5469 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005470 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005471 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472
Tim Peters772747b2001-08-09 22:21:55 +00005473 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005474 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
5476 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005477 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005479 /* Check for BOM marks (U+FEFF) in the input and adjust current
5480 byte order setting accordingly. In native mode, the leading BOM
5481 mark is skipped, in all other modes, it is copied to the output
5482 stream as-is (giving a ZWNBSP character). */
5483 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005484 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005485 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 if (bom == 0xFEFF) {
5488 q += 2;
5489 bo = -1;
5490 }
5491 else if (bom == 0xFFFE) {
5492 q += 2;
5493 bo = 1;
5494 }
Tim Petersced69f82003-09-16 20:30:58 +00005495#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 if (bom == 0xFEFF) {
5497 q += 2;
5498 bo = 1;
5499 }
5500 else if (bom == 0xFFFE) {
5501 q += 2;
5502 bo = -1;
5503 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005504#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507
Tim Peters772747b2001-08-09 22:21:55 +00005508 if (bo == -1) {
5509 /* force LE */
5510 ihi = 1;
5511 ilo = 0;
5512 }
5513 else if (bo == 1) {
5514 /* force BE */
5515 ihi = 0;
5516 ilo = 1;
5517 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005518#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5519 native_ordering = ilo < ihi;
5520#else
5521 native_ordering = ilo > ihi;
5522#endif
Tim Peters772747b2001-08-09 22:21:55 +00005523
Antoine Pitrouab868312009-01-10 15:40:25 +00005524 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005525 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005526 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005527 /* First check for possible aligned read of a C 'long'. Unaligned
5528 reads are more expensive, better to defer to another iteration. */
5529 if (!((size_t) q & LONG_PTR_MASK)) {
5530 /* Fast path for runs of non-surrogate chars. */
5531 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005532 int kind = PyUnicode_KIND(unicode);
5533 void *data = PyUnicode_DATA(unicode);
5534 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005535 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005536 Py_UCS4 maxch;
5537 if (native_ordering) {
5538 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005539 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005540 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005541 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005542 else {
5543 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005544 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005545 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005546 block = ((block >> 8) & STRIPPED_MASK) |
5547 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005548 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005549 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005550#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005551 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5552 maxch = Py_MAX(maxch, ch);
5553 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5554 maxch = Py_MAX(maxch, ch);
5555 ch = (Py_UCS2)(block >> 48);
5556 maxch = Py_MAX(maxch, ch);
5557#else
5558 ch = (Py_UCS2)(block >> 16);
5559 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005560#endif
5561 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5562 if (unicode_widen(&unicode, maxch) < 0)
5563 goto onError;
5564 kind = PyUnicode_KIND(unicode);
5565 data = PyUnicode_DATA(unicode);
5566 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005567#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5568 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005569#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005570 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5571 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5572 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5573#else
5574 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5575#endif
5576#else
5577#if SIZEOF_LONG == 8
5578 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5579 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5580 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5581#else
5582 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5583#endif
5584 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005585#endif
5586 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005587 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005588 q = _q;
5589 if (q >= e)
5590 break;
5591 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593
Benjamin Peterson14339b62009-01-31 16:36:08 +00005594 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005595
Victor Stinner551ac952011-11-29 22:58:13 +01005596 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005597 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5598 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 continue;
5600 }
5601
5602 /* UTF-16 code pair: */
5603 if (q > e) {
5604 errmsg = "unexpected end of data";
5605 startinpos = (((const char *)q) - 2) - starts;
5606 endinpos = ((const char *)e) + 1 - starts;
5607 goto utf16Error;
5608 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005609 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5610 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005612 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005613 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005614 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 continue;
5617 }
5618 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005619 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 startinpos = (((const char *)q)-4)-starts;
5621 endinpos = startinpos+2;
5622 goto utf16Error;
5623 }
5624
Benjamin Peterson14339b62009-01-31 16:36:08 +00005625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 errmsg = "illegal encoding";
5627 startinpos = (((const char *)q)-2)-starts;
5628 endinpos = startinpos+2;
5629 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005630
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005633 errors,
5634 &errorHandler,
5635 "utf16", errmsg,
5636 &starts,
5637 (const char **)&e,
5638 &startinpos,
5639 &endinpos,
5640 &exc,
5641 (const char **)&q,
5642 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005646 /* remaining byte at the end? (size should be even) */
5647 if (e == q) {
5648 if (!consumed) {
5649 errmsg = "truncated data";
5650 startinpos = ((const char *)q) - starts;
5651 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005652 if (unicode_decode_call_errorhandler(
5653 errors,
5654 &errorHandler,
5655 "utf16", errmsg,
5656 &starts,
5657 (const char **)&e,
5658 &startinpos,
5659 &endinpos,
5660 &exc,
5661 (const char **)&q,
5662 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005663 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005664 goto onError;
5665 /* The remaining input chars are ignored if the callback
5666 chooses to skip the input */
5667 }
5668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
5670 if (byteorder)
5671 *byteorder = bo;
5672
Walter Dörwald69652032004-09-07 20:24:22 +00005673 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005677 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 goto onError;
5679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 Py_XDECREF(errorHandler);
5681 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005682 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 return NULL;
5689}
5690
Antoine Pitrouab868312009-01-10 15:40:25 +00005691#undef FAST_CHAR_MASK
5692#undef SWAPPED_FAST_CHAR_MASK
5693
Tim Peters772747b2001-08-09 22:21:55 +00005694PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005695_PyUnicode_EncodeUTF16(PyObject *str,
5696 const char *errors,
5697 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005699 int kind;
5700 void *data;
5701 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005702 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005703 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005704 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005705 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005706 /* Offsets from p for storing byte pairs in the right order. */
5707#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5708 int ihi = 1, ilo = 0;
5709#else
5710 int ihi = 0, ilo = 1;
5711#endif
5712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713#define STORECHAR(CH) \
5714 do { \
5715 p[ihi] = ((CH) >> 8) & 0xff; \
5716 p[ilo] = (CH) & 0xff; \
5717 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005718 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720 if (!PyUnicode_Check(str)) {
5721 PyErr_BadArgument();
5722 return NULL;
5723 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005724 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725 return NULL;
5726 kind = PyUnicode_KIND(str);
5727 data = PyUnicode_DATA(str);
5728 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 pairs = 0;
5731 if (kind == PyUnicode_4BYTE_KIND)
5732 for (i = 0; i < len; i++)
5733 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5734 pairs++;
5735 /* 2 * (len + pairs + (byteorder == 0)) */
5736 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005739 bytesize = nsize * 2;
5740 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005742 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 if (v == NULL)
5744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005746 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005750 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005751
5752 if (byteorder == -1) {
5753 /* force LE */
5754 ihi = 1;
5755 ilo = 0;
5756 }
5757 else if (byteorder == 1) {
5758 /* force BE */
5759 ihi = 0;
5760 ilo = 1;
5761 }
5762
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763 for (i = 0; i < len; i++) {
5764 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5765 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005767 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5768 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 }
Tim Peters772747b2001-08-09 22:21:55 +00005770 STORECHAR(ch);
5771 if (ch2)
5772 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005773 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774
5775 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005776 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005777#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778}
5779
Alexander Belopolsky40018472011-02-26 01:02:56 +00005780PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005781PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5782 Py_ssize_t size,
5783 const char *errors,
5784 int byteorder)
5785{
5786 PyObject *result;
5787 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5788 if (tmp == NULL)
5789 return NULL;
5790 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5791 Py_DECREF(tmp);
5792 return result;
5793}
5794
5795PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005796PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005798 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799}
5800
5801/* --- Unicode Escape Codec ----------------------------------------------- */
5802
/* Helper for PyUnicode_DecodeUnicodeEscape(): pre-scan an escaped byte
   string to find out whether the decoded result is guaranteed to be ASCII.

   s     - input bytes; only s[0] .. s[size-1] are read
   size  - number of bytes in s

   Returns the exact number of characters the decoded string will contain
   when the input consists solely of ASCII bytes and of escapes that always
   decode to ASCII.  A backslash followed by a newline contributes zero
   characters; an unrecognised escape contributes two (the backslash and the
   following character).

   Returns -1 when size is negative, when a non-ASCII byte is present, when
   a backslash is the last byte, or when an octal, \x, \u, \U or \N escape
   is found (those may decode to non-ASCII characters). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before it takes part in any pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    while (p < end) {
        unsigned char c = *p++;

        if (c > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (c != '\\') {
            /* Normal character */
            ++length;
            continue;
        }

        /* Backslash escape: the escape must not run off the end of the
           string and the follow-up byte must be ASCII. */
        if (p >= end || *p > 127)
            return -1;

        switch (*p++) {
        case '\n':
            /* backslash + newline result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            ++length;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* count the backslash + the other character */
            length += 2;
            break;
        }
    }
    return length;
}
5857
Fredrik Lundh06d12682001-01-24 07:59:11 +00005858static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005859
Alexander Belopolsky40018472011-02-26 01:02:56 +00005860PyObject *
5861PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005862 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005863 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t startinpos;
5867 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005868 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005869 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005871 char* message;
5872 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 PyObject *errorHandler = NULL;
5874 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005875 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005877
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005878 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005879
5880 /* After length_of_escaped_ascii_string() there are two alternatives,
5881 either the string is pure ASCII with named escapes like \n, etc.
5882 and we determined it's exact size (common case)
5883 or it contains \x, \u, ... escape sequences. then we create a
5884 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005885 if (len >= 0) {
5886 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 if (!v)
5888 goto onError;
5889 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005890 }
5891 else {
5892 /* Escaped strings will always be longer than the resulting
5893 Unicode string, so we start with size here and then reduce the
5894 length after conversion to the true value.
5895 (but if the error callback returns a long replacement string
5896 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005897 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005898 if (!v)
5899 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005900 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 }
5902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005904 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005905 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005907
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 while (s < end) {
5909 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005910 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005913 /* The only case in which i == ascii_length is a backslash
5914 followed by a newline. */
5915 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 /* Non-escape characters are interpreted as Unicode ordinals */
5918 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5920 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 continue;
5922 }
5923
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 /* \ - Escapes */
5926 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005927 c = *s++;
5928 if (s > end)
5929 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005931 /* The only case in which i == ascii_length is a backslash
5932 followed by a newline. */
5933 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005935 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005938#define WRITECHAR(ch) \
5939 do { \
5940 if (unicode_putchar(&v, &i, ch) < 0) \
5941 goto onError; \
5942 }while(0)
5943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 case '\\': WRITECHAR('\\'); break;
5946 case '\'': WRITECHAR('\''); break;
5947 case '\"': WRITECHAR('\"'); break;
5948 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005949 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005950 case 'f': WRITECHAR('\014'); break;
5951 case 't': WRITECHAR('\t'); break;
5952 case 'n': WRITECHAR('\n'); break;
5953 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005955 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005956 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005957 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 case '0': case '1': case '2': case '3':
5961 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005962 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005963 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005964 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005965 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005966 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 break;
5970
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* hex escapes */
5972 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005974 digits = 2;
5975 message = "truncated \\xXX escape";
5976 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005980 digits = 4;
5981 message = "truncated \\uXXXX escape";
5982 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005985 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005986 digits = 8;
5987 message = "truncated \\UXXXXXXXX escape";
5988 hexescape:
5989 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990 if (s+digits>end) {
5991 endinpos = size;
5992 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 errors, &errorHandler,
5994 "unicodeescape", "end of string in escape sequence",
5995 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005996 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 goto onError;
5998 goto nextByte;
5999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006000 for (j = 0; j < digits; ++j) {
6001 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006002 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006003 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 errors, &errorHandler,
6006 "unicodeescape", message,
6007 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006008 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006009 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006010 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006012 }
6013 chr = (chr<<4) & ~0xF;
6014 if (c >= '0' && c <= '9')
6015 chr += c - '0';
6016 else if (c >= 'a' && c <= 'f')
6017 chr += 10 + c - 'a';
6018 else
6019 chr += 10 + c - 'A';
6020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006021 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006022 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 /* _decoding_error will have already written into the
6024 target buffer. */
6025 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006026 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006027 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006028 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006029 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006030 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 errors, &errorHandler,
6034 "unicodeescape", "illegal Unicode character",
6035 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006036 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006037 goto onError;
6038 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006039 break;
6040
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006042 case 'N':
6043 message = "malformed \\N character escape";
6044 if (ucnhash_CAPI == NULL) {
6045 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006046 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6047 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 if (ucnhash_CAPI == NULL)
6049 goto ucnhashError;
6050 }
6051 if (*s == '{') {
6052 const char *start = s+1;
6053 /* look for the closing brace */
6054 while (*s != '}' && s < end)
6055 s++;
6056 if (s > start && s < end && *s == '}') {
6057 /* found a name. look it up in the unicode database */
6058 message = "unknown Unicode character name";
6059 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006061 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 goto store;
6063 }
6064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 errors, &errorHandler,
6068 "unicodeescape", message,
6069 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006070 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 break;
6073
6074 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006075 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 message = "\\ at end of string";
6077 s--;
6078 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 errors, &errorHandler,
6081 "unicodeescape", message,
6082 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006084 goto onError;
6085 }
6086 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 WRITECHAR('\\');
6088 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006089 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006090 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006096
Victor Stinner16e6a802011-12-12 13:24:15 +01006097 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006099 Py_XDECREF(errorHandler);
6100 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006101 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006102
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006104 PyErr_SetString(
6105 PyExc_UnicodeError,
6106 "\\N escapes not supported (can't load unicodedata module)"
6107 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006108 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 Py_XDECREF(errorHandler);
6110 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006111 return NULL;
6112
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 return NULL;
6118}
6119
6120/* Return a Unicode-Escape string version of the Unicode object.
6121
6122 If quotes is true, the string is enclosed in u"" or u'' quotes as
6123 appropriate.
6124
6125*/
6126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006128PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006130 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006131 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133 int kind;
6134 void *data;
6135 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
Thomas Wouters89f507f2006-12-13 04:49:30 +00006137 /* Initial allocation is based on the longest-possible unichr
6138 escape.
6139
6140 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6141 unichr, so in this case it's the longest unichr escape. In
6142 narrow (UTF-16) builds this is five chars per source unichr
6143 since there are two unichrs in the surrogate pair, so in narrow
6144 (UTF-16) builds it's not the longest unichr escape.
6145
6146 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6147 so in the narrow (UTF-16) build case it's the longest unichr
6148 escape.
6149 */
6150
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 if (!PyUnicode_Check(unicode)) {
6152 PyErr_BadArgument();
6153 return NULL;
6154 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006155 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 return NULL;
6157 len = PyUnicode_GET_LENGTH(unicode);
6158 kind = PyUnicode_KIND(unicode);
6159 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006160 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6162 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6163 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6164 }
6165
6166 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006167 return PyBytes_FromStringAndSize(NULL, 0);
6168
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006171
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006172 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 if (repr == NULL)
6177 return NULL;
6178
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006179 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006182 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006183
Walter Dörwald79e913e2007-05-12 11:08:06 +00006184 /* Escape backslashes */
6185 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 *p++ = '\\';
6187 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006188 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006189 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006190
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006191 /* Map 21-bit characters to '\U00xxxxxx' */
6192 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006193 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006194 *p++ = '\\';
6195 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006196 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6197 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6198 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6199 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6200 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6202 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6203 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006205 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006208 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 *p++ = '\\';
6210 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006211 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6212 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6213 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6214 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006216
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006217 /* Map special whitespace to '\t', \n', '\r' */
6218 else if (ch == '\t') {
6219 *p++ = '\\';
6220 *p++ = 't';
6221 }
6222 else if (ch == '\n') {
6223 *p++ = '\\';
6224 *p++ = 'n';
6225 }
6226 else if (ch == '\r') {
6227 *p++ = '\\';
6228 *p++ = 'r';
6229 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006230
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006231 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006232 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006234 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006235 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6236 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006237 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006238
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 /* Copy everything else as-is */
6240 else
6241 *p++ = (char) ch;
6242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006244 assert(p - PyBytes_AS_STRING(repr) > 0);
6245 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6246 return NULL;
6247 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248}
6249
Alexander Belopolsky40018472011-02-26 01:02:56 +00006250PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6252 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006254 PyObject *result;
6255 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6256 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006258 result = PyUnicode_AsUnicodeEscapeString(tmp);
6259 Py_DECREF(tmp);
6260 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261}
6262
6263/* --- Raw Unicode Escape Codec ------------------------------------------- */
6264
Alexander Belopolsky40018472011-02-26 01:02:56 +00006265PyObject *
6266PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006267 Py_ssize_t size,
6268 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006270 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006271 Py_ssize_t startinpos;
6272 Py_ssize_t endinpos;
6273 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006274 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 const char *end;
6276 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 PyObject *errorHandler = NULL;
6278 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 /* Escaped strings will always be longer than the resulting
6281 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 length after conversion to the true value. (But decoding error
6283 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006284 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006288 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006289 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 end = s + size;
6291 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 unsigned char c;
6293 Py_UCS4 x;
6294 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006295 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 /* Non-escape characters are interpreted as Unicode ordinals */
6298 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006299 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6300 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 startinpos = s-starts;
6304
6305 /* \u-escapes are only interpreted iff the number of leading
6306 backslashes if odd */
6307 bs = s;
6308 for (;s < end;) {
6309 if (*s != '\\')
6310 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006311 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6312 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 }
6314 if (((s - bs) & 1) == 0 ||
6315 s >= end ||
6316 (*s != 'u' && *s != 'U')) {
6317 continue;
6318 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006319 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 count = *s=='u' ? 4 : 8;
6321 s++;
6322
6323 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 for (x = 0, i = 0; i < count; ++i, ++s) {
6325 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006326 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 endinpos = s-starts;
6328 if (unicode_decode_call_errorhandler(
6329 errors, &errorHandler,
6330 "rawunicodeescape", "truncated \\uXXXX",
6331 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006332 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 goto onError;
6334 goto nextByte;
6335 }
6336 x = (x<<4) & ~0xF;
6337 if (c >= '0' && c <= '9')
6338 x += c - '0';
6339 else if (c >= 'a' && c <= 'f')
6340 x += 10 + c - 'a';
6341 else
6342 x += 10 + c - 'A';
6343 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006344 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006345 if (unicode_putchar(&v, &outpos, x) < 0)
6346 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006347 } else {
6348 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 if (unicode_decode_call_errorhandler(
6350 errors, &errorHandler,
6351 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006353 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 nextByte:
6357 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006359 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 Py_XDECREF(errorHandler);
6362 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006363 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006364
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 Py_XDECREF(errorHandler);
6368 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 return NULL;
6370}
6371
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006372
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006374PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006376 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 char *p;
6378 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006379 Py_ssize_t expandsize, pos;
6380 int kind;
6381 void *data;
6382 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006384 if (!PyUnicode_Check(unicode)) {
6385 PyErr_BadArgument();
6386 return NULL;
6387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006388 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006389 return NULL;
6390 kind = PyUnicode_KIND(unicode);
6391 data = PyUnicode_DATA(unicode);
6392 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006393 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6394 bytes, and 1 byte characters 4. */
6395 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006396
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006397 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006399
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006400 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 if (repr == NULL)
6402 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006404 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006406 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407 for (pos = 0; pos < len; pos++) {
6408 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 /* Map 32-bit characters to '\Uxxxxxxxx' */
6410 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006411 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006412 *p++ = '\\';
6413 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006414 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6415 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6416 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6417 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6418 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6419 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6420 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6421 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006422 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 *p++ = '\\';
6426 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006427 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6428 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6429 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6430 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 /* Copy everything else as-is */
6433 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 *p++ = (char) ch;
6435 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006436
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006437 assert(p > q);
6438 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006439 return NULL;
6440 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441}
6442
Alexander Belopolsky40018472011-02-26 01:02:56 +00006443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6445 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006447 PyObject *result;
6448 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6449 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006450 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6452 Py_DECREF(tmp);
6453 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454}
6455
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006456/* --- Unicode Internal Codec ------------------------------------------- */
6457
Alexander Belopolsky40018472011-02-26 01:02:56 +00006458PyObject *
6459_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006460 Py_ssize_t size,
6461 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006462{
6463 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006464 Py_ssize_t startinpos;
6465 Py_ssize_t endinpos;
6466 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006467 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006468 const char *end;
6469 const char *reason;
6470 PyObject *errorHandler = NULL;
6471 PyObject *exc = NULL;
6472
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006473 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006474 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006475 1))
6476 return NULL;
6477
Thomas Wouters89f507f2006-12-13 04:49:30 +00006478 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006479 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006480 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006482 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006483 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006484 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006485 end = s + size;
6486
6487 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006488 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006489 Py_UCS4 ch;
6490 /* We copy the raw representation one byte at a time because the
6491 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006492 ((char *) &uch)[0] = s[0];
6493 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006494#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006495 ((char *) &uch)[2] = s[2];
6496 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006497#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006498 ch = uch;
6499
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006500 /* We have to sanity check the raw data, otherwise doom looms for
6501 some malformed UCS-4 data. */
6502 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006503#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006504 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006505#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006506 end-s < Py_UNICODE_SIZE
6507 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006509 startinpos = s - starts;
6510 if (end-s < Py_UNICODE_SIZE) {
6511 endinpos = end-starts;
6512 reason = "truncated input";
6513 }
6514 else {
6515 endinpos = s - starts + Py_UNICODE_SIZE;
6516 reason = "illegal code point (> 0x10FFFF)";
6517 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006518 if (unicode_decode_call_errorhandler(
6519 errors, &errorHandler,
6520 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006521 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006522 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006523 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006524 continue;
6525 }
6526
6527 s += Py_UNICODE_SIZE;
6528#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006529 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006530 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006531 Py_UNICODE uch2;
6532 ((char *) &uch2)[0] = s[0];
6533 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006534 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006535 {
Victor Stinner551ac952011-11-29 22:58:13 +01006536 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006537 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006538 }
6539 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006540#endif
6541
6542 if (unicode_putchar(&v, &outpos, ch) < 0)
6543 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006544 }
6545
Victor Stinner16e6a802011-12-12 13:24:15 +01006546 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006547 goto onError;
6548 Py_XDECREF(errorHandler);
6549 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006550 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006553 Py_XDECREF(v);
6554 Py_XDECREF(errorHandler);
6555 Py_XDECREF(exc);
6556 return NULL;
6557}
6558
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559/* --- Latin-1 Codec ------------------------------------------------------ */
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
6562PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006563 Py_ssize_t size,
6564 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006567 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568}
6569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006570/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006571static void
6572make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006573 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006574 PyObject *unicode,
6575 Py_ssize_t startpos, Py_ssize_t endpos,
6576 const char *reason)
6577{
6578 if (*exceptionObject == NULL) {
6579 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006581 encoding, unicode, startpos, endpos, reason);
6582 }
6583 else {
6584 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6585 goto onError;
6586 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6587 goto onError;
6588 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6589 goto onError;
6590 return;
6591 onError:
6592 Py_DECREF(*exceptionObject);
6593 *exceptionObject = NULL;
6594 }
6595}
6596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006598static void
6599raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006600 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006601 PyObject *unicode,
6602 Py_ssize_t startpos, Py_ssize_t endpos,
6603 const char *reason)
6604{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006605 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006606 encoding, unicode, startpos, endpos, reason);
6607 if (*exceptionObject != NULL)
6608 PyCodec_StrictErrors(*exceptionObject);
6609}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610
6611/* error handling callback helper:
6612 build arguments, call the callback and check the arguments,
6613 put the result into newpos and return the replacement string, which
6614 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static PyObject *
6616unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 PyObject **errorHandler,
6618 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006620 Py_ssize_t startpos, Py_ssize_t endpos,
6621 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006623 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 PyObject *restuple;
6626 PyObject *resunicode;
6627
6628 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 }
6633
Benjamin Petersonbac79492012-01-14 13:34:47 -05006634 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 return NULL;
6636 len = PyUnicode_GET_LENGTH(unicode);
6637
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006638 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006639 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006642
6643 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006648 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 Py_DECREF(restuple);
6650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006652 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 &resunicode, newpos)) {
6654 Py_DECREF(restuple);
6655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006657 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6658 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6659 Py_DECREF(restuple);
6660 return NULL;
6661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 *newpos = len + *newpos;
6664 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6666 Py_DECREF(restuple);
6667 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006668 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 Py_INCREF(resunicode);
6670 Py_DECREF(restuple);
6671 return resunicode;
6672}
6673
Alexander Belopolsky40018472011-02-26 01:02:56 +00006674static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006676 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006677 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006678{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 /* input state */
6680 Py_ssize_t pos=0, size;
6681 int kind;
6682 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 /* output object */
6684 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 /* pointer into the output */
6686 char *str;
6687 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006689 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6690 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 PyObject *errorHandler = NULL;
6692 PyObject *exc = NULL;
6693 /* the following variable is used for caching string comparisons
6694 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6695 int known_errorHandler = -1;
6696
Benjamin Petersonbac79492012-01-14 13:34:47 -05006697 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698 return NULL;
6699 size = PyUnicode_GET_LENGTH(unicode);
6700 kind = PyUnicode_KIND(unicode);
6701 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 /* allocate enough for a simple encoding without
6703 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006704 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006705 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006706 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006708 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006709 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 ressize = size;
6711
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 while (pos < size) {
6713 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 /* can we encode this? */
6716 if (c<limit) {
6717 /* no overflow check, because we know that the space is enough */
6718 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006720 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 Py_ssize_t requiredsize;
6723 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 Py_ssize_t collstart = pos;
6727 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 ++collend;
6731 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6732 if (known_errorHandler==-1) {
6733 if ((errors==NULL) || (!strcmp(errors, "strict")))
6734 known_errorHandler = 1;
6735 else if (!strcmp(errors, "replace"))
6736 known_errorHandler = 2;
6737 else if (!strcmp(errors, "ignore"))
6738 known_errorHandler = 3;
6739 else if (!strcmp(errors, "xmlcharrefreplace"))
6740 known_errorHandler = 4;
6741 else
6742 known_errorHandler = 0;
6743 }
6744 switch (known_errorHandler) {
6745 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006746 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 goto onError;
6748 case 2: /* replace */
6749 while (collstart++<collend)
6750 *str++ = '?'; /* fall through */
6751 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 break;
6754 case 4: /* xmlcharrefreplace */
6755 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 /* determine replacement size */
6757 for (i = collstart, repsize = 0; i < collend; ++i) {
6758 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6759 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006771 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006772 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006776 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 if (requiredsize > ressize) {
6778 if (requiredsize<2*ressize)
6779 requiredsize = 2*ressize;
6780 if (_PyBytes_Resize(&res, requiredsize))
6781 goto onError;
6782 str = PyBytes_AS_STRING(res) + respos;
6783 ressize = requiredsize;
6784 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 /* generate replacement */
6786 for (i = collstart; i < collend; ++i) {
6787 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 break;
6791 default:
6792 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 encoding, reason, unicode, &exc,
6794 collstart, collend, &newpos);
6795 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006796 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006798 if (PyBytes_Check(repunicode)) {
6799 /* Directly copy bytes result to output. */
6800 repsize = PyBytes_Size(repunicode);
6801 if (repsize > 1) {
6802 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006803 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006804 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6805 Py_DECREF(repunicode);
6806 goto onError;
6807 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006808 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006809 ressize += repsize-1;
6810 }
6811 memcpy(str, PyBytes_AsString(repunicode), repsize);
6812 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006813 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006814 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006815 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006816 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 /* need more space? (at least enough for what we
6818 have+the replacement+the rest of the string, so
6819 we won't have to check space for encodable characters) */
6820 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 repsize = PyUnicode_GET_LENGTH(repunicode);
6822 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 if (requiredsize > ressize) {
6824 if (requiredsize<2*ressize)
6825 requiredsize = 2*ressize;
6826 if (_PyBytes_Resize(&res, requiredsize)) {
6827 Py_DECREF(repunicode);
6828 goto onError;
6829 }
6830 str = PyBytes_AS_STRING(res) + respos;
6831 ressize = requiredsize;
6832 }
6833 /* check if there is anything unencodable in the replacement
6834 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835 for (i = 0; repsize-->0; ++i, ++str) {
6836 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006838 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 Py_DECREF(repunicode);
6841 goto onError;
6842 }
6843 *str = (char)c;
6844 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006846 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006848 }
6849 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006850 /* Resize if we allocated to much */
6851 size = str - PyBytes_AS_STRING(res);
6852 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006853 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006854 if (_PyBytes_Resize(&res, size) < 0)
6855 goto onError;
6856 }
6857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 Py_XDECREF(errorHandler);
6859 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006860 return res;
6861
6862 onError:
6863 Py_XDECREF(res);
6864 Py_XDECREF(errorHandler);
6865 Py_XDECREF(exc);
6866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867}
6868
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870PyObject *
6871PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006872 Py_ssize_t size,
6873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 PyObject *result;
6876 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6877 if (unicode == NULL)
6878 return NULL;
6879 result = unicode_encode_ucs1(unicode, errors, 256);
6880 Py_DECREF(unicode);
6881 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882}
6883
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006885_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 PyErr_BadArgument();
6889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006891 if (PyUnicode_READY(unicode) == -1)
6892 return NULL;
6893 /* Fast path: if it is a one-byte string, construct
6894 bytes object directly. */
6895 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6896 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6897 PyUnicode_GET_LENGTH(unicode));
6898 /* Non-Latin-1 characters present. Defer to above function to
6899 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901}
6902
6903PyObject*
6904PyUnicode_AsLatin1String(PyObject *unicode)
6905{
6906 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
6909/* --- 7-bit ASCII Codec -------------------------------------------------- */
6910
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_DecodeASCII(const char *s,
6913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006917 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006918 int kind;
6919 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006920 Py_ssize_t startinpos;
6921 Py_ssize_t endinpos;
6922 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006924 int has_error;
6925 const unsigned char *p = (const unsigned char *)s;
6926 const unsigned char *end = p + size;
6927 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 PyObject *errorHandler = NULL;
6929 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006930
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006931 if (size == 0) {
6932 Py_INCREF(unicode_empty);
6933 return unicode_empty;
6934 }
6935
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006937 if (size == 1 && (unsigned char)s[0] < 128)
6938 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006939
Victor Stinner702c7342011-10-05 13:50:52 +02006940 has_error = 0;
6941 while (p < end && !has_error) {
6942 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6943 an explanation. */
6944 if (!((size_t) p & LONG_PTR_MASK)) {
6945 /* Help register allocation */
6946 register const unsigned char *_p = p;
6947 while (_p < aligned_end) {
6948 unsigned long value = *(unsigned long *) _p;
6949 if (value & ASCII_CHAR_MASK) {
6950 has_error = 1;
6951 break;
6952 }
6953 _p += SIZEOF_LONG;
6954 }
6955 if (_p == end)
6956 break;
6957 if (has_error)
6958 break;
6959 p = _p;
6960 }
6961 if (*p & 0x80) {
6962 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006964 }
6965 else {
6966 ++p;
6967 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006968 }
Victor Stinner702c7342011-10-05 13:50:52 +02006969 if (!has_error)
6970 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006971
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006972 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006976 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006977 kind = PyUnicode_KIND(v);
6978 data = PyUnicode_DATA(v);
6979 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006980 e = s + size;
6981 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 register unsigned char c = (unsigned char)*s;
6983 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006984 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 ++s;
6986 }
6987 else {
6988 startinpos = s-starts;
6989 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 if (unicode_decode_call_errorhandler(
6991 errors, &errorHandler,
6992 "ascii", "ordinal not in range(128)",
6993 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006994 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006996 kind = PyUnicode_KIND(v);
6997 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007000 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007001 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002 Py_XDECREF(errorHandler);
7003 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007004 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007005 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007006
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007009 Py_XDECREF(errorHandler);
7010 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 return NULL;
7012}
7013
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007014/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007015PyObject *
7016PyUnicode_EncodeASCII(const Py_UNICODE *p,
7017 Py_ssize_t size,
7018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007020 PyObject *result;
7021 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7022 if (unicode == NULL)
7023 return NULL;
7024 result = unicode_encode_ucs1(unicode, errors, 128);
7025 Py_DECREF(unicode);
7026 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007030_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
7032 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 PyErr_BadArgument();
7034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007036 if (PyUnicode_READY(unicode) == -1)
7037 return NULL;
7038 /* Fast path: if it is an ASCII-only string, construct bytes object
7039 directly. Else defer to above function to raise the exception. */
7040 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7041 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7042 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007043 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044}
7045
7046PyObject *
7047PyUnicode_AsASCIIString(PyObject *unicode)
7048{
7049 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050}
7051
Victor Stinner99b95382011-07-04 14:23:54 +02007052#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007053
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007054/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007055
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007056#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057#define NEED_RETRY
7058#endif
7059
Victor Stinner3a50e702011-10-18 21:21:00 +02007060#ifndef WC_ERR_INVALID_CHARS
7061# define WC_ERR_INVALID_CHARS 0x0080
7062#endif
7063
7064static char*
7065code_page_name(UINT code_page, PyObject **obj)
7066{
7067 *obj = NULL;
7068 if (code_page == CP_ACP)
7069 return "mbcs";
7070 if (code_page == CP_UTF7)
7071 return "CP_UTF7";
7072 if (code_page == CP_UTF8)
7073 return "CP_UTF8";
7074
7075 *obj = PyBytes_FromFormat("cp%u", code_page);
7076 if (*obj == NULL)
7077 return NULL;
7078 return PyBytes_AS_STRING(*obj);
7079}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
Alexander Belopolsky40018472011-02-26 01:02:56 +00007081static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007082is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007083{
7084 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 if (!IsDBCSLeadByteEx(code_page, *curr))
7088 return 0;
7089
7090 prev = CharPrevExA(code_page, s, curr, 0);
7091 if (prev == curr)
7092 return 1;
7093 /* FIXME: This code is limited to "true" double-byte encodings,
7094 as it assumes an incomplete character consists of a single
7095 byte. */
7096 if (curr - prev == 2)
7097 return 1;
7098 if (!IsDBCSLeadByteEx(code_page, *prev))
7099 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100 return 0;
7101}
7102
Victor Stinner3a50e702011-10-18 21:21:00 +02007103static DWORD
7104decode_code_page_flags(UINT code_page)
7105{
7106 if (code_page == CP_UTF7) {
7107 /* The CP_UTF7 decoder only supports flags=0 */
7108 return 0;
7109 }
7110 else
7111 return MB_ERR_INVALID_CHARS;
7112}
7113
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 * Decode a byte string from a Windows code page into unicode object in strict
7116 * mode.
7117 *
7118 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7119 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007121static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007122decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007123 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 const char *in,
7125 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126{
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007128 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130
7131 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 assert(insize > 0);
7133 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7134 if (outsize <= 0)
7135 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136
7137 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007139 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007140 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 if (*v == NULL)
7142 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 }
7145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007148 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151 }
7152
7153 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7155 if (outsize <= 0)
7156 goto error;
7157 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007158
Victor Stinner3a50e702011-10-18 21:21:00 +02007159error:
7160 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7161 return -2;
7162 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007163 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164}
7165
Victor Stinner3a50e702011-10-18 21:21:00 +02007166/*
7167 * Decode a byte string from a code page into unicode object with an error
7168 * handler.
7169 *
7170 * Returns consumed size if succeed, or raise a WindowsError or
7171 * UnicodeDecodeError exception and returns -1 on error.
7172 */
7173static int
7174decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007175 PyObject **v,
7176 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 const char *errors)
7178{
7179 const char *startin = in;
7180 const char *endin = in + size;
7181 const DWORD flags = decode_code_page_flags(code_page);
7182 /* Ideally, we should get reason from FormatMessage. This is the Windows
7183 2000 English version of the message. */
7184 const char *reason = "No mapping for the Unicode character exists "
7185 "in the target code page.";
7186 /* each step cannot decode more than 1 character, but a character can be
7187 represented as a surrogate pair */
7188 wchar_t buffer[2], *startout, *out;
7189 int insize, outsize;
7190 PyObject *errorHandler = NULL;
7191 PyObject *exc = NULL;
7192 PyObject *encoding_obj = NULL;
7193 char *encoding;
7194 DWORD err;
7195 int ret = -1;
7196
7197 assert(size > 0);
7198
7199 encoding = code_page_name(code_page, &encoding_obj);
7200 if (encoding == NULL)
7201 return -1;
7202
7203 if (errors == NULL || strcmp(errors, "strict") == 0) {
7204 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7205 UnicodeDecodeError. */
7206 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7207 if (exc != NULL) {
7208 PyCodec_StrictErrors(exc);
7209 Py_CLEAR(exc);
7210 }
7211 goto error;
7212 }
7213
7214 if (*v == NULL) {
7215 /* Create unicode object */
7216 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7217 PyErr_NoMemory();
7218 goto error;
7219 }
Victor Stinnerab595942011-12-17 04:59:06 +01007220 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007221 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 if (*v == NULL)
7223 goto error;
7224 startout = PyUnicode_AS_UNICODE(*v);
7225 }
7226 else {
7227 /* Extend unicode object */
7228 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7229 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7230 PyErr_NoMemory();
7231 goto error;
7232 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007233 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 goto error;
7235 startout = PyUnicode_AS_UNICODE(*v) + n;
7236 }
7237
7238 /* Decode the byte string character per character */
7239 out = startout;
7240 while (in < endin)
7241 {
7242 /* Decode a character */
7243 insize = 1;
7244 do
7245 {
7246 outsize = MultiByteToWideChar(code_page, flags,
7247 in, insize,
7248 buffer, Py_ARRAY_LENGTH(buffer));
7249 if (outsize > 0)
7250 break;
7251 err = GetLastError();
7252 if (err != ERROR_NO_UNICODE_TRANSLATION
7253 && err != ERROR_INSUFFICIENT_BUFFER)
7254 {
7255 PyErr_SetFromWindowsErr(0);
7256 goto error;
7257 }
7258 insize++;
7259 }
7260 /* 4=maximum length of a UTF-8 sequence */
7261 while (insize <= 4 && (in + insize) <= endin);
7262
7263 if (outsize <= 0) {
7264 Py_ssize_t startinpos, endinpos, outpos;
7265
7266 startinpos = in - startin;
7267 endinpos = startinpos + 1;
7268 outpos = out - PyUnicode_AS_UNICODE(*v);
7269 if (unicode_decode_call_errorhandler(
7270 errors, &errorHandler,
7271 encoding, reason,
7272 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007273 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 {
7275 goto error;
7276 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007277 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 }
7279 else {
7280 in += insize;
7281 memcpy(out, buffer, outsize * sizeof(wchar_t));
7282 out += outsize;
7283 }
7284 }
7285
7286 /* write a NUL character at the end */
7287 *out = 0;
7288
7289 /* Extend unicode object */
7290 outsize = out - startout;
7291 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007292 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007294 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007295
7296error:
7297 Py_XDECREF(encoding_obj);
7298 Py_XDECREF(errorHandler);
7299 Py_XDECREF(exc);
7300 return ret;
7301}
7302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303static PyObject *
7304decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007305 const char *s, Py_ssize_t size,
7306 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007307{
Victor Stinner76a31a62011-11-04 00:05:13 +01007308 PyObject *v = NULL;
7309 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 if (code_page < 0) {
7312 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7313 return NULL;
7314 }
7315
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 do
7320 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 if (size > INT_MAX) {
7323 chunk_size = INT_MAX;
7324 final = 0;
7325 done = 0;
7326 }
7327 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007329 {
7330 chunk_size = (int)size;
7331 final = (consumed == NULL);
7332 done = 1;
7333 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007334
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 /* Skip trailing lead-byte unless 'final' is set */
7336 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7337 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 if (chunk_size == 0 && done) {
7340 if (v != NULL)
7341 break;
7342 Py_INCREF(unicode_empty);
7343 return unicode_empty;
7344 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345
Victor Stinner76a31a62011-11-04 00:05:13 +01007346
7347 converted = decode_code_page_strict(code_page, &v,
7348 s, chunk_size);
7349 if (converted == -2)
7350 converted = decode_code_page_errors(code_page, &v,
7351 s, chunk_size,
7352 errors);
7353 assert(converted != 0);
7354
7355 if (converted < 0) {
7356 Py_XDECREF(v);
7357 return NULL;
7358 }
7359
7360 if (consumed)
7361 *consumed += converted;
7362
7363 s += converted;
7364 size -= converted;
7365 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007366
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007367 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368}
7369
Alexander Belopolsky40018472011-02-26 01:02:56 +00007370PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007371PyUnicode_DecodeCodePageStateful(int code_page,
7372 const char *s,
7373 Py_ssize_t size,
7374 const char *errors,
7375 Py_ssize_t *consumed)
7376{
7377 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7378}
7379
7380PyObject *
7381PyUnicode_DecodeMBCSStateful(const char *s,
7382 Py_ssize_t size,
7383 const char *errors,
7384 Py_ssize_t *consumed)
7385{
7386 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7387}
7388
7389PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007390PyUnicode_DecodeMBCS(const char *s,
7391 Py_ssize_t size,
7392 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007393{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007394 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7395}
7396
Victor Stinner3a50e702011-10-18 21:21:00 +02007397static DWORD
7398encode_code_page_flags(UINT code_page, const char *errors)
7399{
7400 if (code_page == CP_UTF8) {
7401 if (winver.dwMajorVersion >= 6)
7402 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7403 and later */
7404 return WC_ERR_INVALID_CHARS;
7405 else
7406 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7407 return 0;
7408 }
7409 else if (code_page == CP_UTF7) {
7410 /* CP_UTF7 only supports flags=0 */
7411 return 0;
7412 }
7413 else {
7414 if (errors != NULL && strcmp(errors, "replace") == 0)
7415 return 0;
7416 else
7417 return WC_NO_BEST_FIT_CHARS;
7418 }
7419}
7420
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 * Encode a Unicode string to a Windows code page into a byte string in strict
7423 * mode.
7424 *
7425 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7426 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007428static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007429encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432{
Victor Stinner554f3f02010-06-16 23:33:54 +00007433 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 BOOL *pusedDefaultChar = &usedDefaultChar;
7435 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007436 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007437 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 const DWORD flags = encode_code_page_flags(code_page, NULL);
7440 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 /* Create a substring so that we can get the UTF-16 representation
7442 of just the slice under consideration. */
7443 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444
Martin v. Löwis3d325192011-11-04 18:23:06 +01007445 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007446
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007448 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007450 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007451
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 substring = PyUnicode_Substring(unicode, offset, offset+len);
7453 if (substring == NULL)
7454 return -1;
7455 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7456 if (p == NULL) {
7457 Py_DECREF(substring);
7458 return -1;
7459 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007461 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 outsize = WideCharToMultiByte(code_page, flags,
7463 p, size,
7464 NULL, 0,
7465 NULL, pusedDefaultChar);
7466 if (outsize <= 0)
7467 goto error;
7468 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 if (pusedDefaultChar && *pusedDefaultChar) {
7470 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007473
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 if (*outbytes == NULL) {
7478 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482 }
7483 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 const Py_ssize_t n = PyBytes_Size(*outbytes);
7486 if (outsize > PY_SSIZE_T_MAX - n) {
7487 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7492 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496 }
7497
7498 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 outsize = WideCharToMultiByte(code_page, flags,
7500 p, size,
7501 out, outsize,
7502 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 if (outsize <= 0)
7505 goto error;
7506 if (pusedDefaultChar && *pusedDefaultChar)
7507 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007508 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007511 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7513 return -2;
7514 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007515 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007516}
7517
Victor Stinner3a50e702011-10-18 21:21:00 +02007518/*
7519 * Encode a Unicode string to a Windows code page into a byte string using a
7520 * error handler.
7521 *
7522 * Returns consumed characters if succeed, or raise a WindowsError and returns
7523 * -1 on other error.
7524 */
7525static int
7526encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007527 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007528 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007529{
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007531 Py_ssize_t pos = unicode_offset;
7532 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 /* Ideally, we should get reason from FormatMessage. This is the Windows
7534 2000 English version of the message. */
7535 const char *reason = "invalid character";
7536 /* 4=maximum length of a UTF-8 sequence */
7537 char buffer[4];
7538 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7539 Py_ssize_t outsize;
7540 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 PyObject *errorHandler = NULL;
7542 PyObject *exc = NULL;
7543 PyObject *encoding_obj = NULL;
7544 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007545 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 PyObject *rep;
7547 int ret = -1;
7548
7549 assert(insize > 0);
7550
7551 encoding = code_page_name(code_page, &encoding_obj);
7552 if (encoding == NULL)
7553 return -1;
7554
7555 if (errors == NULL || strcmp(errors, "strict") == 0) {
7556 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7557 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007558 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007559 if (exc != NULL) {
7560 PyCodec_StrictErrors(exc);
7561 Py_DECREF(exc);
7562 }
7563 Py_XDECREF(encoding_obj);
7564 return -1;
7565 }
7566
7567 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7568 pusedDefaultChar = &usedDefaultChar;
7569 else
7570 pusedDefaultChar = NULL;
7571
7572 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7573 PyErr_NoMemory();
7574 goto error;
7575 }
7576 outsize = insize * Py_ARRAY_LENGTH(buffer);
7577
7578 if (*outbytes == NULL) {
7579 /* Create string object */
7580 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7581 if (*outbytes == NULL)
7582 goto error;
7583 out = PyBytes_AS_STRING(*outbytes);
7584 }
7585 else {
7586 /* Extend string object */
7587 Py_ssize_t n = PyBytes_Size(*outbytes);
7588 if (n > PY_SSIZE_T_MAX - outsize) {
7589 PyErr_NoMemory();
7590 goto error;
7591 }
7592 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7593 goto error;
7594 out = PyBytes_AS_STRING(*outbytes) + n;
7595 }
7596
7597 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007598 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007600 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7601 wchar_t chars[2];
7602 int charsize;
7603 if (ch < 0x10000) {
7604 chars[0] = (wchar_t)ch;
7605 charsize = 1;
7606 }
7607 else {
7608 ch -= 0x10000;
7609 chars[0] = 0xd800 + (ch >> 10);
7610 chars[1] = 0xdc00 + (ch & 0x3ff);
7611 charsize = 2;
7612 }
7613
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 buffer, Py_ARRAY_LENGTH(buffer),
7617 NULL, pusedDefaultChar);
7618 if (outsize > 0) {
7619 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7620 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007621 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 memcpy(out, buffer, outsize);
7623 out += outsize;
7624 continue;
7625 }
7626 }
7627 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7628 PyErr_SetFromWindowsErr(0);
7629 goto error;
7630 }
7631
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 rep = unicode_encode_call_errorhandler(
7633 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007634 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 if (rep == NULL)
7637 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007638 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007639
7640 if (PyBytes_Check(rep)) {
7641 outsize = PyBytes_GET_SIZE(rep);
7642 if (outsize != 1) {
7643 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7644 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7645 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7646 Py_DECREF(rep);
7647 goto error;
7648 }
7649 out = PyBytes_AS_STRING(*outbytes) + offset;
7650 }
7651 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7652 out += outsize;
7653 }
7654 else {
7655 Py_ssize_t i;
7656 enum PyUnicode_Kind kind;
7657 void *data;
7658
Benjamin Petersonbac79492012-01-14 13:34:47 -05007659 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 Py_DECREF(rep);
7661 goto error;
7662 }
7663
7664 outsize = PyUnicode_GET_LENGTH(rep);
7665 if (outsize != 1) {
7666 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7667 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7668 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7669 Py_DECREF(rep);
7670 goto error;
7671 }
7672 out = PyBytes_AS_STRING(*outbytes) + offset;
7673 }
7674 kind = PyUnicode_KIND(rep);
7675 data = PyUnicode_DATA(rep);
7676 for (i=0; i < outsize; i++) {
7677 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7678 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007679 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007680 encoding, unicode,
7681 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007682 "unable to encode error handler result to ASCII");
7683 Py_DECREF(rep);
7684 goto error;
7685 }
7686 *out = (unsigned char)ch;
7687 out++;
7688 }
7689 }
7690 Py_DECREF(rep);
7691 }
7692 /* write a NUL byte */
7693 *out = 0;
7694 outsize = out - PyBytes_AS_STRING(*outbytes);
7695 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7696 if (_PyBytes_Resize(outbytes, outsize) < 0)
7697 goto error;
7698 ret = 0;
7699
7700error:
7701 Py_XDECREF(encoding_obj);
7702 Py_XDECREF(errorHandler);
7703 Py_XDECREF(exc);
7704 return ret;
7705}
7706
Victor Stinner3a50e702011-10-18 21:21:00 +02007707static PyObject *
7708encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007709 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007710 const char *errors)
7711{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007712 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007714 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007716
Benjamin Petersonbac79492012-01-14 13:34:47 -05007717 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007718 return NULL;
7719 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007720
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 if (code_page < 0) {
7722 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7723 return NULL;
7724 }
7725
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 return PyBytes_FromStringAndSize(NULL, 0);
7728
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 offset = 0;
7730 do
7731 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007732#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007733 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 chunks. */
7735 if (len > INT_MAX/2) {
7736 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 done = 0;
7738 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007740#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007741 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007743 done = 1;
7744 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007745
Victor Stinner76a31a62011-11-04 00:05:13 +01007746 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007747 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007748 errors);
7749 if (ret == -2)
7750 ret = encode_code_page_errors(code_page, &outbytes,
7751 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007752 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007753 if (ret < 0) {
7754 Py_XDECREF(outbytes);
7755 return NULL;
7756 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757
Victor Stinner7581cef2011-11-03 22:32:33 +01007758 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007759 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007760 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007761
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 return outbytes;
7763}
7764
7765PyObject *
7766PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7767 Py_ssize_t size,
7768 const char *errors)
7769{
Victor Stinner7581cef2011-11-03 22:32:33 +01007770 PyObject *unicode, *res;
7771 unicode = PyUnicode_FromUnicode(p, size);
7772 if (unicode == NULL)
7773 return NULL;
7774 res = encode_code_page(CP_ACP, unicode, errors);
7775 Py_DECREF(unicode);
7776 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007777}
7778
7779PyObject *
7780PyUnicode_EncodeCodePage(int code_page,
7781 PyObject *unicode,
7782 const char *errors)
7783{
Victor Stinner7581cef2011-11-03 22:32:33 +01007784 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007785}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007786
Alexander Belopolsky40018472011-02-26 01:02:56 +00007787PyObject *
7788PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007789{
7790 if (!PyUnicode_Check(unicode)) {
7791 PyErr_BadArgument();
7792 return NULL;
7793 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007794 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007795}
7796
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007797#undef NEED_RETRY
7798
Victor Stinner99b95382011-07-04 14:23:54 +02007799#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007800
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801/* --- Character Mapping Codec -------------------------------------------- */
7802
Alexander Belopolsky40018472011-02-26 01:02:56 +00007803PyObject *
7804PyUnicode_DecodeCharmap(const char *s,
7805 Py_ssize_t size,
7806 PyObject *mapping,
7807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007810 Py_ssize_t startinpos;
7811 Py_ssize_t endinpos;
7812 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007814 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007815 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 PyObject *errorHandler = NULL;
7817 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007818
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 /* Default to Latin-1 */
7820 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007823 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007827 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007828 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007829 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007830 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007831 Py_ssize_t maplen;
7832 enum PyUnicode_Kind kind;
7833 void *data;
7834 Py_UCS4 x;
7835
Benjamin Petersonbac79492012-01-14 13:34:47 -05007836 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007837 return NULL;
7838
7839 maplen = PyUnicode_GET_LENGTH(mapping);
7840 data = PyUnicode_DATA(mapping);
7841 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 while (s < e) {
7843 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007846 x = PyUnicode_READ(kind, data, ch);
7847 else
7848 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007850 if (x == 0xfffe)
7851 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 startinpos = s-starts;
7854 endinpos = startinpos+1;
7855 if (unicode_decode_call_errorhandler(
7856 errors, &errorHandler,
7857 "charmap", "character maps to <undefined>",
7858 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007859 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 goto onError;
7861 }
7862 continue;
7863 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007864
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007865 if (unicode_putchar(&v, &outpos, x) < 0)
7866 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007868 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007869 }
7870 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 while (s < e) {
7872 unsigned char ch = *s;
7873 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007874
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7876 w = PyLong_FromLong((long)ch);
7877 if (w == NULL)
7878 goto onError;
7879 x = PyObject_GetItem(mapping, w);
7880 Py_DECREF(w);
7881 if (x == NULL) {
7882 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7883 /* No mapping found means: mapping is undefined. */
7884 PyErr_Clear();
7885 x = Py_None;
7886 Py_INCREF(x);
7887 } else
7888 goto onError;
7889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 /* Apply mapping */
7892 if (PyLong_Check(x)) {
7893 long value = PyLong_AS_LONG(x);
7894 if (value < 0 || value > 65535) {
7895 PyErr_SetString(PyExc_TypeError,
7896 "character mapping must be in range(65536)");
7897 Py_DECREF(x);
7898 goto onError;
7899 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007900 if (unicode_putchar(&v, &outpos, value) < 0)
7901 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 }
7903 else if (x == Py_None) {
7904 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 startinpos = s-starts;
7906 endinpos = startinpos+1;
7907 if (unicode_decode_call_errorhandler(
7908 errors, &errorHandler,
7909 "charmap", "character maps to <undefined>",
7910 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007911 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 Py_DECREF(x);
7913 goto onError;
7914 }
7915 Py_DECREF(x);
7916 continue;
7917 }
7918 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007919 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920
Benjamin Petersonbac79492012-01-14 13:34:47 -05007921 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007922 goto onError;
7923 targetsize = PyUnicode_GET_LENGTH(x);
7924
7925 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007927 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007928 PyUnicode_READ_CHAR(x, 0)) < 0)
7929 goto onError;
7930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 else if (targetsize > 1) {
7932 /* 1-n mapping */
7933 if (targetsize > extrachars) {
7934 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 Py_ssize_t needed = (targetsize - extrachars) + \
7936 (targetsize << 2);
7937 extrachars += needed;
7938 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007939 if (unicode_resize(&v,
7940 PyUnicode_GET_LENGTH(v) + needed) < 0)
7941 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 Py_DECREF(x);
7943 goto onError;
7944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007946 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7947 goto onError;
7948 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7949 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 extrachars -= targetsize;
7951 }
7952 /* 1-0 mapping: skip the character */
7953 }
7954 else {
7955 /* wrong return value */
7956 PyErr_SetString(PyExc_TypeError,
7957 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007958 Py_DECREF(x);
7959 goto onError;
7960 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 Py_DECREF(x);
7962 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007965 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007966 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967 Py_XDECREF(errorHandler);
7968 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007969 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007970
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 Py_XDECREF(errorHandler);
7973 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 Py_XDECREF(v);
7975 return NULL;
7976}
7977
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007978/* Charmap encoding: the lookup table */
7979
Alexander Belopolsky40018472011-02-26 01:02:56 +00007980struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 PyObject_HEAD
7982 unsigned char level1[32];
7983 int count2, count3;
7984 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985};
7986
7987static PyObject*
7988encoding_map_size(PyObject *obj, PyObject* args)
7989{
7990 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993}
7994
7995static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 PyDoc_STR("Return the size (in bytes) of this object") },
7998 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007999};
8000
8001static void
8002encoding_map_dealloc(PyObject* o)
8003{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005}
8006
8007static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 "EncodingMap", /*tp_name*/
8010 sizeof(struct encoding_map), /*tp_basicsize*/
8011 0, /*tp_itemsize*/
8012 /* methods */
8013 encoding_map_dealloc, /*tp_dealloc*/
8014 0, /*tp_print*/
8015 0, /*tp_getattr*/
8016 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008017 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 0, /*tp_repr*/
8019 0, /*tp_as_number*/
8020 0, /*tp_as_sequence*/
8021 0, /*tp_as_mapping*/
8022 0, /*tp_hash*/
8023 0, /*tp_call*/
8024 0, /*tp_str*/
8025 0, /*tp_getattro*/
8026 0, /*tp_setattro*/
8027 0, /*tp_as_buffer*/
8028 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8029 0, /*tp_doc*/
8030 0, /*tp_traverse*/
8031 0, /*tp_clear*/
8032 0, /*tp_richcompare*/
8033 0, /*tp_weaklistoffset*/
8034 0, /*tp_iter*/
8035 0, /*tp_iternext*/
8036 encoding_map_methods, /*tp_methods*/
8037 0, /*tp_members*/
8038 0, /*tp_getset*/
8039 0, /*tp_base*/
8040 0, /*tp_dict*/
8041 0, /*tp_descr_get*/
8042 0, /*tp_descr_set*/
8043 0, /*tp_dictoffset*/
8044 0, /*tp_init*/
8045 0, /*tp_alloc*/
8046 0, /*tp_new*/
8047 0, /*tp_free*/
8048 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049};
8050
8051PyObject*
8052PyUnicode_BuildEncodingMap(PyObject* string)
8053{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054 PyObject *result;
8055 struct encoding_map *mresult;
8056 int i;
8057 int need_dict = 0;
8058 unsigned char level1[32];
8059 unsigned char level2[512];
8060 unsigned char *mlevel1, *mlevel2, *mlevel3;
8061 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 int kind;
8063 void *data;
8064 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 PyErr_BadArgument();
8068 return NULL;
8069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 kind = PyUnicode_KIND(string);
8071 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072 memset(level1, 0xFF, sizeof level1);
8073 memset(level2, 0xFF, sizeof level2);
8074
8075 /* If there isn't a one-to-one mapping of NULL to \0,
8076 or if there are non-BMP characters, we need to use
8077 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079 need_dict = 1;
8080 for (i = 1; i < 256; i++) {
8081 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 ch = PyUnicode_READ(kind, data, i);
8083 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 need_dict = 1;
8085 break;
8086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008087 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088 /* unmapped character */
8089 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 l1 = ch >> 11;
8091 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008092 if (level1[l1] == 0xFF)
8093 level1[l1] = count2++;
8094 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096 }
8097
8098 if (count2 >= 0xFF || count3 >= 0xFF)
8099 need_dict = 1;
8100
8101 if (need_dict) {
8102 PyObject *result = PyDict_New();
8103 PyObject *key, *value;
8104 if (!result)
8105 return NULL;
8106 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008108 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 if (!key || !value)
8110 goto failed1;
8111 if (PyDict_SetItem(result, key, value) == -1)
8112 goto failed1;
8113 Py_DECREF(key);
8114 Py_DECREF(value);
8115 }
8116 return result;
8117 failed1:
8118 Py_XDECREF(key);
8119 Py_XDECREF(value);
8120 Py_DECREF(result);
8121 return NULL;
8122 }
8123
8124 /* Create a three-level trie */
8125 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8126 16*count2 + 128*count3 - 1);
8127 if (!result)
8128 return PyErr_NoMemory();
8129 PyObject_Init(result, &EncodingMapType);
8130 mresult = (struct encoding_map*)result;
8131 mresult->count2 = count2;
8132 mresult->count3 = count3;
8133 mlevel1 = mresult->level1;
8134 mlevel2 = mresult->level23;
8135 mlevel3 = mresult->level23 + 16*count2;
8136 memcpy(mlevel1, level1, 32);
8137 memset(mlevel2, 0xFF, 16*count2);
8138 memset(mlevel3, 0, 128*count3);
8139 count3 = 0;
8140 for (i = 1; i < 256; i++) {
8141 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 /* unmapped character */
8144 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 o1 = PyUnicode_READ(kind, data, i)>>11;
8146 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 i2 = 16*mlevel1[o1] + o2;
8148 if (mlevel2[i2] == 0xFF)
8149 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008150 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151 i3 = 128*mlevel2[i2] + o3;
8152 mlevel3[i3] = i;
8153 }
8154 return result;
8155}
8156
8157static int
Victor Stinner22168992011-11-20 17:09:18 +01008158encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159{
8160 struct encoding_map *map = (struct encoding_map*)mapping;
8161 int l1 = c>>11;
8162 int l2 = (c>>7) & 0xF;
8163 int l3 = c & 0x7F;
8164 int i;
8165
Victor Stinner22168992011-11-20 17:09:18 +01008166 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008168 if (c == 0)
8169 return 0;
8170 /* level 1*/
8171 i = map->level1[l1];
8172 if (i == 0xFF) {
8173 return -1;
8174 }
8175 /* level 2*/
8176 i = map->level23[16*i+l2];
8177 if (i == 0xFF) {
8178 return -1;
8179 }
8180 /* level 3 */
8181 i = map->level23[16*map->count2 + 128*i + l3];
8182 if (i == 0) {
8183 return -1;
8184 }
8185 return i;
8186}
8187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008188/* Lookup the character ch in the mapping. If the character
8189 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008190 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008191static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008192charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193{
Christian Heimes217cfd12007-12-02 14:31:20 +00008194 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008195 PyObject *x;
8196
8197 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 x = PyObject_GetItem(mapping, w);
8200 Py_DECREF(w);
8201 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8203 /* No mapping found means: mapping is undefined. */
8204 PyErr_Clear();
8205 x = Py_None;
8206 Py_INCREF(x);
8207 return x;
8208 } else
8209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008211 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008213 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 long value = PyLong_AS_LONG(x);
8215 if (value < 0 || value > 255) {
8216 PyErr_SetString(PyExc_TypeError,
8217 "character mapping must be in range(256)");
8218 Py_DECREF(x);
8219 return NULL;
8220 }
8221 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008223 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 /* wrong return value */
8227 PyErr_Format(PyExc_TypeError,
8228 "character mapping must return integer, bytes or None, not %.400s",
8229 x->ob_type->tp_name);
8230 Py_DECREF(x);
8231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 }
8233}
8234
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008235static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008236charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008237{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8239 /* exponentially overallocate to minimize reallocations */
8240 if (requiredsize < 2*outsize)
8241 requiredsize = 2*outsize;
8242 if (_PyBytes_Resize(outobj, requiredsize))
8243 return -1;
8244 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245}
8246
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008251 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 space is available. Return a new reference to the object that
8253 was put in the output buffer, or Py_None, if the mapping was undefined
8254 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008255 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008256static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008257charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008258 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260 PyObject *rep;
8261 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008262 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263
Christian Heimes90aa7642007-12-19 02:45:37 +00008264 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267 if (res == -1)
8268 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 if (outsize<requiredsize)
8270 if (charmapencode_resize(outobj, outpos, requiredsize))
8271 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008272 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 outstart[(*outpos)++] = (char)res;
8274 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008275 }
8276
8277 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008280 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 Py_DECREF(rep);
8282 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008283 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 if (PyLong_Check(rep)) {
8285 Py_ssize_t requiredsize = *outpos+1;
8286 if (outsize<requiredsize)
8287 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8288 Py_DECREF(rep);
8289 return enc_EXCEPTION;
8290 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008291 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008293 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 else {
8295 const char *repchars = PyBytes_AS_STRING(rep);
8296 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8297 Py_ssize_t requiredsize = *outpos+repsize;
8298 if (outsize<requiredsize)
8299 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8300 Py_DECREF(rep);
8301 return enc_EXCEPTION;
8302 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008303 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 memcpy(outstart + *outpos, repchars, repsize);
8305 *outpos += repsize;
8306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308 Py_DECREF(rep);
8309 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310}
8311
8312/* handle an error in PyUnicode_EncodeCharmap
8313 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008314static int
8315charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008316 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008318 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008319 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320{
8321 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008322 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008323 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008324 enum PyUnicode_Kind kind;
8325 void *data;
8326 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008328 Py_ssize_t collstartpos = *inpos;
8329 Py_ssize_t collendpos = *inpos+1;
8330 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 char *encoding = "charmap";
8332 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008334 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008335 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336
Benjamin Petersonbac79492012-01-14 13:34:47 -05008337 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008338 return -1;
8339 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340 /* find all unencodable characters */
8341 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008343 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008344 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008345 val = encoding_map_lookup(ch, mapping);
8346 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 break;
8348 ++collendpos;
8349 continue;
8350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008351
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008352 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8353 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 if (rep==NULL)
8355 return -1;
8356 else if (rep!=Py_None) {
8357 Py_DECREF(rep);
8358 break;
8359 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008360 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
8363 /* cache callback name lookup
8364 * (if not done yet, i.e. it's the first error) */
8365 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 if ((errors==NULL) || (!strcmp(errors, "strict")))
8367 *known_errorHandler = 1;
8368 else if (!strcmp(errors, "replace"))
8369 *known_errorHandler = 2;
8370 else if (!strcmp(errors, "ignore"))
8371 *known_errorHandler = 3;
8372 else if (!strcmp(errors, "xmlcharrefreplace"))
8373 *known_errorHandler = 4;
8374 else
8375 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 }
8377 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008378 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008379 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008380 return -1;
8381 case 2: /* replace */
8382 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 x = charmapencode_output('?', mapping, res, respos);
8384 if (x==enc_EXCEPTION) {
8385 return -1;
8386 }
8387 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008388 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 return -1;
8390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 }
8392 /* fall through */
8393 case 3: /* ignore */
8394 *inpos = collendpos;
8395 break;
8396 case 4: /* xmlcharrefreplace */
8397 /* generate replacement (temporarily (mis)uses p) */
8398 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 char buffer[2+29+1+1];
8400 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008401 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 for (cp = buffer; *cp; ++cp) {
8403 x = charmapencode_output(*cp, mapping, res, respos);
8404 if (x==enc_EXCEPTION)
8405 return -1;
8406 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008407 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 return -1;
8409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 }
8411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 *inpos = collendpos;
8413 break;
8414 default:
8415 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008420 if (PyBytes_Check(repunicode)) {
8421 /* Directly copy bytes result to output. */
8422 Py_ssize_t outsize = PyBytes_Size(*res);
8423 Py_ssize_t requiredsize;
8424 repsize = PyBytes_Size(repunicode);
8425 requiredsize = *respos + repsize;
8426 if (requiredsize > outsize)
8427 /* Make room for all additional bytes. */
8428 if (charmapencode_resize(res, respos, requiredsize)) {
8429 Py_DECREF(repunicode);
8430 return -1;
8431 }
8432 memcpy(PyBytes_AsString(*res) + *respos,
8433 PyBytes_AsString(repunicode), repsize);
8434 *respos += repsize;
8435 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008436 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008437 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008440 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008441 Py_DECREF(repunicode);
8442 return -1;
8443 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008444 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008445 data = PyUnicode_DATA(repunicode);
8446 kind = PyUnicode_KIND(repunicode);
8447 for (index = 0; index < repsize; index++) {
8448 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8449 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008451 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return -1;
8453 }
8454 else if (x==enc_FAILED) {
8455 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008456 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 return -1;
8458 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 }
8460 *inpos = newpos;
8461 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 }
8463 return 0;
8464}
8465
Alexander Belopolsky40018472011-02-26 01:02:56 +00008466PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008467_PyUnicode_EncodeCharmap(PyObject *unicode,
8468 PyObject *mapping,
8469 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 /* output object */
8472 PyObject *res = NULL;
8473 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008474 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008475 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008477 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 PyObject *errorHandler = NULL;
8479 PyObject *exc = NULL;
8480 /* the following variable is used for caching string comparisons
8481 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8482 * 3=ignore, 4=xmlcharrefreplace */
8483 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
Benjamin Petersonbac79492012-01-14 13:34:47 -05008485 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008486 return NULL;
8487 size = PyUnicode_GET_LENGTH(unicode);
8488
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 /* Default to Latin-1 */
8490 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 /* allocate enough for a simple encoding without
8494 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008495 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496 if (res == NULL)
8497 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008498 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008504 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 if (x==enc_EXCEPTION) /* error */
8506 goto onError;
8507 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 &exc,
8510 &known_errorHandler, &errorHandler, errors,
8511 &res, &respos)) {
8512 goto onError;
8513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 else
8516 /* done with this character => adjust input position */
8517 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008521 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008522 if (_PyBytes_Resize(&res, respos) < 0)
8523 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 Py_XDECREF(exc);
8526 Py_XDECREF(errorHandler);
8527 return res;
8528
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 Py_XDECREF(res);
8531 Py_XDECREF(exc);
8532 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 return NULL;
8534}
8535
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536/* Deprecated */
8537PyObject *
8538PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8539 Py_ssize_t size,
8540 PyObject *mapping,
8541 const char *errors)
8542{
8543 PyObject *result;
8544 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8545 if (unicode == NULL)
8546 return NULL;
8547 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8548 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008549 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550}
8551
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552PyObject *
8553PyUnicode_AsCharmapString(PyObject *unicode,
8554 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555{
8556 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 PyErr_BadArgument();
8558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561}
8562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008564static void
8565make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008567 Py_ssize_t startpos, Py_ssize_t endpos,
8568 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 *exceptionObject = _PyUnicodeTranslateError_Create(
8572 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 }
8574 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8576 goto onError;
8577 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8578 goto onError;
8579 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8580 goto onError;
8581 return;
8582 onError:
8583 Py_DECREF(*exceptionObject);
8584 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
8586}
8587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008589static void
8590raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008592 Py_ssize_t startpos, Py_ssize_t endpos,
8593 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594{
8595 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599}
8600
8601/* error handling callback helper:
8602 build arguments, call the callback and check the arguments,
8603 put the result into newpos and return the replacement string, which
8604 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605static PyObject *
8606unicode_translate_call_errorhandler(const char *errors,
8607 PyObject **errorHandler,
8608 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008610 Py_ssize_t startpos, Py_ssize_t endpos,
8611 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008613 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008615 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 PyObject *restuple;
8617 PyObject *resunicode;
8618
8619 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 }
8624
8625 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629
8630 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008635 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 Py_DECREF(restuple);
8637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 }
8639 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 &resunicode, &i_newpos)) {
8641 Py_DECREF(restuple);
8642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008644 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008646 else
8647 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8650 Py_DECREF(restuple);
8651 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 Py_INCREF(resunicode);
8654 Py_DECREF(restuple);
8655 return resunicode;
8656}
8657
8658/* Lookup the character ch in the mapping and put the result in result,
8659 which must be decrefed by the caller.
8660 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008661static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663{
Christian Heimes217cfd12007-12-02 14:31:20 +00008664 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 PyObject *x;
8666
8667 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 x = PyObject_GetItem(mapping, w);
8670 Py_DECREF(w);
8671 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8673 /* No mapping found means: use 1:1 mapping. */
8674 PyErr_Clear();
8675 *result = NULL;
8676 return 0;
8677 } else
8678 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 }
8680 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 *result = x;
8682 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008684 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 long value = PyLong_AS_LONG(x);
8686 long max = PyUnicode_GetMax();
8687 if (value < 0 || value > max) {
8688 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008689 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 Py_DECREF(x);
8691 return -1;
8692 }
8693 *result = x;
8694 return 0;
8695 }
8696 else if (PyUnicode_Check(x)) {
8697 *result = x;
8698 return 0;
8699 }
8700 else {
8701 /* wrong return value */
8702 PyErr_SetString(PyExc_TypeError,
8703 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008704 Py_DECREF(x);
8705 return -1;
8706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707}
8708/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 if not reallocate and adjust various state variables.
8710 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008711static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008716 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 /* exponentially overallocate to minimize reallocations */
8718 if (requiredsize < 2 * oldsize)
8719 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8721 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
8725 return 0;
8726}
8727/* lookup the character, put the result in the output string and adjust
8728 various state variables. Return a new reference to the object that
8729 was put in the output buffer in *result, or Py_None, if the mapping was
8730 undefined (in which case no character was written).
8731 The called must decref result.
8732 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008733static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8735 PyObject *mapping, Py_UCS4 **output,
8736 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008737 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8740 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 }
8746 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008748 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751 }
8752 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 Py_ssize_t repsize;
8754 if (PyUnicode_READY(*res) == -1)
8755 return -1;
8756 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 if (repsize==1) {
8758 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 }
8761 else if (repsize!=0) {
8762 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 Py_ssize_t requiredsize = *opos +
8764 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 Py_ssize_t i;
8767 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 for(i = 0; i < repsize; i++)
8770 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 }
8773 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 return 0;
8776}
8777
Alexander Belopolsky40018472011-02-26 01:02:56 +00008778PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779_PyUnicode_TranslateCharmap(PyObject *input,
8780 PyObject *mapping,
8781 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 /* input object */
8784 char *idata;
8785 Py_ssize_t size, i;
8786 int kind;
8787 /* output buffer */
8788 Py_UCS4 *output = NULL;
8789 Py_ssize_t osize;
8790 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 char *reason = "character maps to <undefined>";
8794 PyObject *errorHandler = NULL;
8795 PyObject *exc = NULL;
8796 /* the following variable is used for caching string comparisons
8797 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8798 * 3=ignore, 4=xmlcharrefreplace */
8799 int known_errorHandler = -1;
8800
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 PyErr_BadArgument();
8803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 if (PyUnicode_READY(input) == -1)
8807 return NULL;
8808 idata = (char*)PyUnicode_DATA(input);
8809 kind = PyUnicode_KIND(input);
8810 size = PyUnicode_GET_LENGTH(input);
8811 i = 0;
8812
8813 if (size == 0) {
8814 Py_INCREF(input);
8815 return input;
8816 }
8817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818 /* allocate enough for a simple 1:1 translation without
8819 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 osize = size;
8821 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8822 opos = 0;
8823 if (output == NULL) {
8824 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 /* try to encode it */
8830 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 if (charmaptranslate_output(input, i, mapping,
8832 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 Py_XDECREF(x);
8834 goto onError;
8835 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008836 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 else { /* untranslatable character */
8840 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8841 Py_ssize_t repsize;
8842 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 Py_ssize_t collstart = i;
8846 Py_ssize_t collend = i+1;
8847 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 while (collend < size) {
8851 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 goto onError;
8853 Py_XDECREF(x);
8854 if (x!=Py_None)
8855 break;
8856 ++collend;
8857 }
8858 /* cache callback name lookup
8859 * (if not done yet, i.e. it's the first error) */
8860 if (known_errorHandler==-1) {
8861 if ((errors==NULL) || (!strcmp(errors, "strict")))
8862 known_errorHandler = 1;
8863 else if (!strcmp(errors, "replace"))
8864 known_errorHandler = 2;
8865 else if (!strcmp(errors, "ignore"))
8866 known_errorHandler = 3;
8867 else if (!strcmp(errors, "xmlcharrefreplace"))
8868 known_errorHandler = 4;
8869 else
8870 known_errorHandler = 0;
8871 }
8872 switch (known_errorHandler) {
8873 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 raise_translate_exception(&exc, input, collstart,
8875 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008876 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 case 2: /* replace */
8878 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 for (coll = collstart; coll<collend; coll++)
8880 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 /* fall through */
8882 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 break;
8885 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 /* generate replacement (temporarily (mis)uses i) */
8887 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 char buffer[2+29+1+1];
8889 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8891 if (charmaptranslate_makespace(&output, &osize,
8892 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 goto onError;
8894 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 break;
8899 default:
8900 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 reason, input, &exc,
8902 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008903 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008905 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008906 Py_DECREF(repunicode);
8907 goto onError;
8908 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 repsize = PyUnicode_GET_LENGTH(repunicode);
8911 if (charmaptranslate_makespace(&output, &osize,
8912 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 Py_DECREF(repunicode);
8914 goto onError;
8915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 for (uni2 = 0; repsize-->0; ++uni2)
8917 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8918 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008920 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008921 }
8922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8924 if (!res)
8925 goto onError;
8926 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 Py_XDECREF(exc);
8928 Py_XDECREF(errorHandler);
8929 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008933 Py_XDECREF(exc);
8934 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 return NULL;
8936}
8937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938/* Deprecated. Use PyUnicode_Translate instead. */
8939PyObject *
8940PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8941 Py_ssize_t size,
8942 PyObject *mapping,
8943 const char *errors)
8944{
8945 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8946 if (!unicode)
8947 return NULL;
8948 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8949}
8950
Alexander Belopolsky40018472011-02-26 01:02:56 +00008951PyObject *
8952PyUnicode_Translate(PyObject *str,
8953 PyObject *mapping,
8954 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955{
8956 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008957
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 str = PyUnicode_FromObject(str);
8959 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 Py_DECREF(str);
8963 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008964
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 Py_XDECREF(str);
8967 return NULL;
8968}
Tim Petersced69f82003-09-16 20:30:58 +00008969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008971fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972{
8973 /* No need to call PyUnicode_READY(self) because this function is only
8974 called as a callback from fixup() which does it already. */
8975 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8976 const int kind = PyUnicode_KIND(self);
8977 void *data = PyUnicode_DATA(self);
8978 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008979 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 Py_ssize_t i;
8981
8982 for (i = 0; i < len; ++i) {
8983 ch = PyUnicode_READ(kind, data, i);
8984 fixed = 0;
8985 if (ch > 127) {
8986 if (Py_UNICODE_ISSPACE(ch))
8987 fixed = ' ';
8988 else {
8989 const int decimal = Py_UNICODE_TODECIMAL(ch);
8990 if (decimal >= 0)
8991 fixed = '0' + decimal;
8992 }
8993 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008994 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 if (fixed > maxchar)
8996 maxchar = fixed;
8997 PyUnicode_WRITE(kind, data, i, fixed);
8998 }
8999 else if (ch > maxchar)
9000 maxchar = ch;
9001 }
9002 else if (ch > maxchar)
9003 maxchar = ch;
9004 }
9005
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009006 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007}
9008
9009PyObject *
9010_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9011{
9012 if (!PyUnicode_Check(unicode)) {
9013 PyErr_BadInternalCall();
9014 return NULL;
9015 }
9016 if (PyUnicode_READY(unicode) == -1)
9017 return NULL;
9018 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9019 /* If the string is already ASCII, just return the same string */
9020 Py_INCREF(unicode);
9021 return unicode;
9022 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009023 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024}
9025
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009026PyObject *
9027PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9028 Py_ssize_t length)
9029{
Victor Stinnerf0124502011-11-21 23:12:56 +01009030 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009031 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009032 Py_UCS4 maxchar;
9033 enum PyUnicode_Kind kind;
9034 void *data;
9035
Victor Stinner99d7ad02012-02-22 13:37:39 +01009036 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009037 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009038 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009039 if (ch > 127) {
9040 int decimal = Py_UNICODE_TODECIMAL(ch);
9041 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009042 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009043 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009044 }
9045 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009046
9047 /* Copy to a new string */
9048 decimal = PyUnicode_New(length, maxchar);
9049 if (decimal == NULL)
9050 return decimal;
9051 kind = PyUnicode_KIND(decimal);
9052 data = PyUnicode_DATA(decimal);
9053 /* Iterate over code points */
9054 for (i = 0; i < length; i++) {
9055 Py_UNICODE ch = s[i];
9056 if (ch > 127) {
9057 int decimal = Py_UNICODE_TODECIMAL(ch);
9058 if (decimal >= 0)
9059 ch = '0' + decimal;
9060 }
9061 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009063 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009064}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009065/* --- Decimal Encoder ---------------------------------------------------- */
9066
Alexander Belopolsky40018472011-02-26 01:02:56 +00009067int
9068PyUnicode_EncodeDecimal(Py_UNICODE *s,
9069 Py_ssize_t length,
9070 char *output,
9071 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009072{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009073 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009074 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009075 enum PyUnicode_Kind kind;
9076 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009077
9078 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 PyErr_BadArgument();
9080 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009081 }
9082
Victor Stinner42bf7752011-11-21 22:52:58 +01009083 unicode = PyUnicode_FromUnicode(s, length);
9084 if (unicode == NULL)
9085 return -1;
9086
Benjamin Petersonbac79492012-01-14 13:34:47 -05009087 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009088 Py_DECREF(unicode);
9089 return -1;
9090 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009091 kind = PyUnicode_KIND(unicode);
9092 data = PyUnicode_DATA(unicode);
9093
Victor Stinnerb84d7232011-11-22 01:50:07 +01009094 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009095 PyObject *exc;
9096 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009098 Py_ssize_t startpos;
9099
9100 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009101
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009103 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009104 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 decimal = Py_UNICODE_TODECIMAL(ch);
9108 if (decimal >= 0) {
9109 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009110 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 continue;
9112 }
9113 if (0 < ch && ch < 256) {
9114 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009115 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 continue;
9117 }
Victor Stinner6345be92011-11-25 20:09:01 +01009118
Victor Stinner42bf7752011-11-21 22:52:58 +01009119 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009120 exc = NULL;
9121 raise_encode_exception(&exc, "decimal", unicode,
9122 startpos, startpos+1,
9123 "invalid decimal Unicode string");
9124 Py_XDECREF(exc);
9125 Py_DECREF(unicode);
9126 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009127 }
9128 /* 0-terminate the output string */
9129 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009130 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009131 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009132}
9133
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134/* --- Helpers ------------------------------------------------------------ */
9135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009137any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 Py_ssize_t start,
9139 Py_ssize_t end)
9140{
9141 int kind1, kind2, kind;
9142 void *buf1, *buf2;
9143 Py_ssize_t len1, len2, result;
9144
9145 kind1 = PyUnicode_KIND(s1);
9146 kind2 = PyUnicode_KIND(s2);
9147 kind = kind1 > kind2 ? kind1 : kind2;
9148 buf1 = PyUnicode_DATA(s1);
9149 buf2 = PyUnicode_DATA(s2);
9150 if (kind1 != kind)
9151 buf1 = _PyUnicode_AsKind(s1, kind);
9152 if (!buf1)
9153 return -2;
9154 if (kind2 != kind)
9155 buf2 = _PyUnicode_AsKind(s2, kind);
9156 if (!buf2) {
9157 if (kind1 != kind) PyMem_Free(buf1);
9158 return -2;
9159 }
9160 len1 = PyUnicode_GET_LENGTH(s1);
9161 len2 = PyUnicode_GET_LENGTH(s2);
9162
Victor Stinner794d5672011-10-10 03:21:36 +02009163 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009164 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009165 case PyUnicode_1BYTE_KIND:
9166 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9167 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9168 else
9169 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9170 break;
9171 case PyUnicode_2BYTE_KIND:
9172 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9173 break;
9174 case PyUnicode_4BYTE_KIND:
9175 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9176 break;
9177 default:
9178 assert(0); result = -2;
9179 }
9180 }
9181 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009182 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009183 case PyUnicode_1BYTE_KIND:
9184 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9185 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9186 else
9187 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9188 break;
9189 case PyUnicode_2BYTE_KIND:
9190 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9191 break;
9192 case PyUnicode_4BYTE_KIND:
9193 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9194 break;
9195 default:
9196 assert(0); result = -2;
9197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 }
9199
9200 if (kind1 != kind)
9201 PyMem_Free(buf1);
9202 if (kind2 != kind)
9203 PyMem_Free(buf2);
9204
9205 return result;
9206}
9207
9208Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009209_PyUnicode_InsertThousandsGrouping(
9210 PyObject *unicode, Py_ssize_t index,
9211 Py_ssize_t n_buffer,
9212 void *digits, Py_ssize_t n_digits,
9213 Py_ssize_t min_width,
9214 const char *grouping, PyObject *thousands_sep,
9215 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216{
Victor Stinner41a863c2012-02-24 00:37:51 +01009217 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009218 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009219 Py_ssize_t thousands_sep_len;
9220 Py_ssize_t len;
9221
9222 if (unicode != NULL) {
9223 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009224 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009225 }
9226 else {
9227 kind = PyUnicode_1BYTE_KIND;
9228 data = NULL;
9229 }
9230 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9231 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9232 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9233 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009234 if (thousands_sep_kind < kind) {
9235 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9236 if (!thousands_sep_data)
9237 return -1;
9238 }
9239 else {
9240 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9241 if (!data)
9242 return -1;
9243 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009244 }
9245
Benjamin Petersonead6b532011-12-20 17:23:42 -06009246 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009248 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009249 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009250 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009251 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009252 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009253 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009254 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009255 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009256 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009257 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009258 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009260 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009261 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009262 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009263 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009264 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009266 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009267 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009268 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009269 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009270 break;
9271 default:
9272 assert(0);
9273 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009275 if (unicode != NULL && thousands_sep_kind != kind) {
9276 if (thousands_sep_kind < kind)
9277 PyMem_Free(thousands_sep_data);
9278 else
9279 PyMem_Free(data);
9280 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009281 if (unicode == NULL) {
9282 *maxchar = 127;
9283 if (len != n_digits) {
9284 *maxchar = Py_MAX(*maxchar,
9285 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9286 }
9287 }
9288 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289}
9290
9291
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `start` or `end` is taken relative
   to `len` and clamped to 0 if it is still negative.  `start` is NOT clamped
   to `len`, so callers must cope with start > end.

   `start` and `end` must be modifiable lvalues, and every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so the
   macro behaves as a single statement (e.g. under an if/else without
   braces); invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009306
Alexander Belopolsky40018472011-02-26 01:02:56 +00009307Py_ssize_t
9308PyUnicode_Count(PyObject *str,
9309 PyObject *substr,
9310 Py_ssize_t start,
9311 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009313 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009314 PyObject* str_obj;
9315 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 int kind1, kind2, kind;
9317 void *buf1 = NULL, *buf2 = NULL;
9318 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009319
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009320 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009321 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009323 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009324 if (!sub_obj) {
9325 Py_DECREF(str_obj);
9326 return -1;
9327 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009328 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009329 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 Py_DECREF(str_obj);
9331 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 }
Tim Petersced69f82003-09-16 20:30:58 +00009333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 kind1 = PyUnicode_KIND(str_obj);
9335 kind2 = PyUnicode_KIND(sub_obj);
9336 kind = kind1 > kind2 ? kind1 : kind2;
9337 buf1 = PyUnicode_DATA(str_obj);
9338 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009339 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 if (!buf1)
9341 goto onError;
9342 buf2 = PyUnicode_DATA(sub_obj);
9343 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009344 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 if (!buf2)
9346 goto onError;
9347 len1 = PyUnicode_GET_LENGTH(str_obj);
9348 len2 = PyUnicode_GET_LENGTH(sub_obj);
9349
9350 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009351 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009353 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9354 result = asciilib_count(
9355 ((Py_UCS1*)buf1) + start, end - start,
9356 buf2, len2, PY_SSIZE_T_MAX
9357 );
9358 else
9359 result = ucs1lib_count(
9360 ((Py_UCS1*)buf1) + start, end - start,
9361 buf2, len2, PY_SSIZE_T_MAX
9362 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 break;
9364 case PyUnicode_2BYTE_KIND:
9365 result = ucs2lib_count(
9366 ((Py_UCS2*)buf1) + start, end - start,
9367 buf2, len2, PY_SSIZE_T_MAX
9368 );
9369 break;
9370 case PyUnicode_4BYTE_KIND:
9371 result = ucs4lib_count(
9372 ((Py_UCS4*)buf1) + start, end - start,
9373 buf2, len2, PY_SSIZE_T_MAX
9374 );
9375 break;
9376 default:
9377 assert(0); result = 0;
9378 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009379
9380 Py_DECREF(sub_obj);
9381 Py_DECREF(str_obj);
9382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 if (kind1 != kind)
9384 PyMem_Free(buf1);
9385 if (kind2 != kind)
9386 PyMem_Free(buf2);
9387
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 onError:
9390 Py_DECREF(sub_obj);
9391 Py_DECREF(str_obj);
9392 if (kind1 != kind && buf1)
9393 PyMem_Free(buf1);
9394 if (kind2 != kind && buf2)
9395 PyMem_Free(buf2);
9396 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397}
9398
Alexander Belopolsky40018472011-02-26 01:02:56 +00009399Py_ssize_t
9400PyUnicode_Find(PyObject *str,
9401 PyObject *sub,
9402 Py_ssize_t start,
9403 Py_ssize_t end,
9404 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009406 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009407
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009409 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009411 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009412 if (!sub) {
9413 Py_DECREF(str);
9414 return -2;
9415 }
9416 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9417 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 Py_DECREF(str);
9419 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 }
Tim Petersced69f82003-09-16 20:30:58 +00009421
Victor Stinner794d5672011-10-10 03:21:36 +02009422 result = any_find_slice(direction,
9423 str, sub, start, end
9424 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009425
Guido van Rossumd57fd912000-03-10 22:53:23 +00009426 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009427 Py_DECREF(sub);
9428
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429 return result;
9430}
9431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432Py_ssize_t
9433PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9434 Py_ssize_t start, Py_ssize_t end,
9435 int direction)
9436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009438 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 if (PyUnicode_READY(str) == -1)
9440 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009441 if (start < 0 || end < 0) {
9442 PyErr_SetString(PyExc_IndexError, "string index out of range");
9443 return -2;
9444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 if (end > PyUnicode_GET_LENGTH(str))
9446 end = PyUnicode_GET_LENGTH(str);
9447 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009448 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9449 kind, end-start, ch, direction);
9450 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009452 else
9453 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454}
9455
Alexander Belopolsky40018472011-02-26 01:02:56 +00009456static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009457tailmatch(PyObject *self,
9458 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009459 Py_ssize_t start,
9460 Py_ssize_t end,
9461 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 int kind_self;
9464 int kind_sub;
9465 void *data_self;
9466 void *data_sub;
9467 Py_ssize_t offset;
9468 Py_ssize_t i;
9469 Py_ssize_t end_sub;
9470
9471 if (PyUnicode_READY(self) == -1 ||
9472 PyUnicode_READY(substring) == -1)
9473 return 0;
9474
9475 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 return 1;
9477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9479 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 kind_self = PyUnicode_KIND(self);
9484 data_self = PyUnicode_DATA(self);
9485 kind_sub = PyUnicode_KIND(substring);
9486 data_sub = PyUnicode_DATA(substring);
9487 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9488
9489 if (direction > 0)
9490 offset = end;
9491 else
9492 offset = start;
9493
9494 if (PyUnicode_READ(kind_self, data_self, offset) ==
9495 PyUnicode_READ(kind_sub, data_sub, 0) &&
9496 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9497 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9498 /* If both are of the same kind, memcmp is sufficient */
9499 if (kind_self == kind_sub) {
9500 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009501 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 data_sub,
9503 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009504 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 }
9506 /* otherwise we have to compare each character by first accesing it */
9507 else {
9508 /* We do not need to compare 0 and len(substring)-1 because
9509 the if statement above ensured already that they are equal
9510 when we end up here. */
9511 // TODO: honor direction and do a forward or backwards search
9512 for (i = 1; i < end_sub; ++i) {
9513 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9514 PyUnicode_READ(kind_sub, data_sub, i))
9515 return 0;
9516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 }
9520
9521 return 0;
9522}
9523
Alexander Belopolsky40018472011-02-26 01:02:56 +00009524Py_ssize_t
9525PyUnicode_Tailmatch(PyObject *str,
9526 PyObject *substr,
9527 Py_ssize_t start,
9528 Py_ssize_t end,
9529 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009531 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009532
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 str = PyUnicode_FromObject(str);
9534 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 substr = PyUnicode_FromObject(substr);
9537 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 Py_DECREF(str);
9539 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 }
Tim Petersced69f82003-09-16 20:30:58 +00009541
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009542 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 Py_DECREF(str);
9545 Py_DECREF(substr);
9546 return result;
9547}
9548
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549/* Apply fixfct filter to the Unicode object self and return a
9550 reference to the modified object */
9551
Alexander Belopolsky40018472011-02-26 01:02:56 +00009552static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009553fixup(PyObject *self,
9554 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 PyObject *u;
9557 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009558 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009560 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009563 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 /* fix functions return the new maximum character in a string,
9566 if the kind of the resulting unicode object does not change,
9567 everything is fine. Otherwise we need to change the string kind
9568 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009569 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009570
9571 if (maxchar_new == 0) {
9572 /* no changes */;
9573 if (PyUnicode_CheckExact(self)) {
9574 Py_DECREF(u);
9575 Py_INCREF(self);
9576 return self;
9577 }
9578 else
9579 return u;
9580 }
9581
9582 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 maxchar_new = 127;
9584 else if (maxchar_new <= 255)
9585 maxchar_new = 255;
9586 else if (maxchar_new <= 65535)
9587 maxchar_new = 65535;
9588 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009589 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590
Victor Stinnereaab6042011-12-11 22:22:39 +01009591 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009593
9594 /* In case the maximum character changed, we need to
9595 convert the string to the new category. */
9596 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9597 if (v == NULL) {
9598 Py_DECREF(u);
9599 return NULL;
9600 }
9601 if (maxchar_new > maxchar_old) {
9602 /* If the maxchar increased so that the kind changed, not all
9603 characters are representable anymore and we need to fix the
9604 string again. This only happens in very few cases. */
9605 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9606 maxchar_old = fixfct(v);
9607 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 }
9609 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009610 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009612 Py_DECREF(u);
9613 assert(_PyUnicode_CheckConsistency(v, 1));
9614 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615}
9616
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617static PyObject *
9618ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9621 char *resdata, *data = PyUnicode_DATA(self);
9622 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009623
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624 res = PyUnicode_New(len, 127);
9625 if (res == NULL)
9626 return NULL;
9627 resdata = PyUnicode_DATA(res);
9628 if (lower)
9629 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631 _Py_bytes_upper(resdata, data, len);
9632 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633}
9634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009636handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638 Py_ssize_t j;
9639 int final_sigma;
9640 Py_UCS4 c;
9641 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009642
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9644
9645 where ! is a negation and \p{xxx} is a character with property xxx.
9646 */
9647 for (j = i - 1; j >= 0; j--) {
9648 c = PyUnicode_READ(kind, data, j);
9649 if (!_PyUnicode_IsCaseIgnorable(c))
9650 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9653 if (final_sigma) {
9654 for (j = i + 1; j < length; j++) {
9655 c = PyUnicode_READ(kind, data, j);
9656 if (!_PyUnicode_IsCaseIgnorable(c))
9657 break;
9658 }
9659 final_sigma = j == length || !_PyUnicode_IsCased(c);
9660 }
9661 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662}
9663
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664static int
9665lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9666 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 /* Obscure special case. */
9669 if (c == 0x3A3) {
9670 mapped[0] = handle_capital_sigma(kind, data, length, i);
9671 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674}
9675
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676static Py_ssize_t
9677do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 Py_ssize_t i, k = 0;
9680 int n_res, j;
9681 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009682
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 c = PyUnicode_READ(kind, data, 0);
9684 n_res = _PyUnicode_ToUpperFull(c, mapped);
9685 for (j = 0; j < n_res; j++) {
9686 if (mapped[j] > *maxchar)
9687 *maxchar = mapped[j];
9688 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 for (i = 1; i < length; i++) {
9691 c = PyUnicode_READ(kind, data, i);
9692 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9693 for (j = 0; j < n_res; j++) {
9694 if (mapped[j] > *maxchar)
9695 *maxchar = mapped[j];
9696 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009697 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009698 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700}
9701
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702static Py_ssize_t
9703do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9704 Py_ssize_t i, k = 0;
9705
9706 for (i = 0; i < length; i++) {
9707 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9708 int n_res, j;
9709 if (Py_UNICODE_ISUPPER(c)) {
9710 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9711 }
9712 else if (Py_UNICODE_ISLOWER(c)) {
9713 n_res = _PyUnicode_ToUpperFull(c, mapped);
9714 }
9715 else {
9716 n_res = 1;
9717 mapped[0] = c;
9718 }
9719 for (j = 0; j < n_res; j++) {
9720 if (mapped[j] > *maxchar)
9721 *maxchar = mapped[j];
9722 res[k++] = mapped[j];
9723 }
9724 }
9725 return k;
9726}
9727
9728static Py_ssize_t
9729do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9730 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 Py_ssize_t i, k = 0;
9733
9734 for (i = 0; i < length; i++) {
9735 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9736 int n_res, j;
9737 if (lower)
9738 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9739 else
9740 n_res = _PyUnicode_ToUpperFull(c, mapped);
9741 for (j = 0; j < n_res; j++) {
9742 if (mapped[j] > *maxchar)
9743 *maxchar = mapped[j];
9744 res[k++] = mapped[j];
9745 }
9746 }
9747 return k;
9748}
9749
9750static Py_ssize_t
9751do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9752{
9753 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9754}
9755
9756static Py_ssize_t
9757do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9758{
9759 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9760}
9761
Benjamin Petersone51757f2012-01-12 21:10:29 -05009762static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009763do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9764{
9765 Py_ssize_t i, k = 0;
9766
9767 for (i = 0; i < length; i++) {
9768 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9769 Py_UCS4 mapped[3];
9770 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9771 for (j = 0; j < n_res; j++) {
9772 if (mapped[j] > *maxchar)
9773 *maxchar = mapped[j];
9774 res[k++] = mapped[j];
9775 }
9776 }
9777 return k;
9778}
9779
9780static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009781do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9782{
9783 Py_ssize_t i, k = 0;
9784 int previous_is_cased;
9785
9786 previous_is_cased = 0;
9787 for (i = 0; i < length; i++) {
9788 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9789 Py_UCS4 mapped[3];
9790 int n_res, j;
9791
9792 if (previous_is_cased)
9793 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9794 else
9795 n_res = _PyUnicode_ToTitleFull(c, mapped);
9796
9797 for (j = 0; j < n_res; j++) {
9798 if (mapped[j] > *maxchar)
9799 *maxchar = mapped[j];
9800 res[k++] = mapped[j];
9801 }
9802
9803 previous_is_cased = _PyUnicode_IsCased(c);
9804 }
9805 return k;
9806}
9807
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808static PyObject *
9809case_operation(PyObject *self,
9810 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9811{
9812 PyObject *res = NULL;
9813 Py_ssize_t length, newlength = 0;
9814 int kind, outkind;
9815 void *data, *outdata;
9816 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9817
Benjamin Petersoneea48462012-01-16 14:28:50 -05009818 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819
9820 kind = PyUnicode_KIND(self);
9821 data = PyUnicode_DATA(self);
9822 length = PyUnicode_GET_LENGTH(self);
9823 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9824 if (tmp == NULL)
9825 return PyErr_NoMemory();
9826 newlength = perform(kind, data, length, tmp, &maxchar);
9827 res = PyUnicode_New(newlength, maxchar);
9828 if (res == NULL)
9829 goto leave;
9830 tmpend = tmp + newlength;
9831 outdata = PyUnicode_DATA(res);
9832 outkind = PyUnicode_KIND(res);
9833 switch (outkind) {
9834 case PyUnicode_1BYTE_KIND:
9835 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9836 break;
9837 case PyUnicode_2BYTE_KIND:
9838 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9839 break;
9840 case PyUnicode_4BYTE_KIND:
9841 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9842 break;
9843 default:
9844 assert(0);
9845 break;
9846 }
9847 leave:
9848 PyMem_FREE(tmp);
9849 return res;
9850}
9851
Tim Peters8ce9f162004-08-27 01:49:32 +00009852PyObject *
9853PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009856 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009858 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009859 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9860 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009861 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009863 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009865 int use_memcpy;
9866 unsigned char *res_data = NULL, *sep_data = NULL;
9867 PyObject *last_obj;
9868 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 fseq = PySequence_Fast(seq, "");
9871 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009872 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009873 }
9874
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009875 /* NOTE: the following code can't call back into Python code,
9876 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009877 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009878
Tim Peters05eba1f2004-08-27 21:32:02 +00009879 seqlen = PySequence_Fast_GET_SIZE(fseq);
9880 /* If empty sequence, return u"". */
9881 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009882 Py_DECREF(fseq);
9883 Py_INCREF(unicode_empty);
9884 res = unicode_empty;
9885 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009886 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009887
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009889 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009890 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009891 if (seqlen == 1) {
9892 if (PyUnicode_CheckExact(items[0])) {
9893 res = items[0];
9894 Py_INCREF(res);
9895 Py_DECREF(fseq);
9896 return res;
9897 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009898 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009899 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009900 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009901 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009902 /* Set up sep and seplen */
9903 if (separator == NULL) {
9904 /* fall back to a blank space separator */
9905 sep = PyUnicode_FromOrdinal(' ');
9906 if (!sep)
9907 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009908 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009909 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009910 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009911 else {
9912 if (!PyUnicode_Check(separator)) {
9913 PyErr_Format(PyExc_TypeError,
9914 "separator: expected str instance,"
9915 " %.80s found",
9916 Py_TYPE(separator)->tp_name);
9917 goto onError;
9918 }
9919 if (PyUnicode_READY(separator))
9920 goto onError;
9921 sep = separator;
9922 seplen = PyUnicode_GET_LENGTH(separator);
9923 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9924 /* inc refcount to keep this code path symmetric with the
9925 above case of a blank separator */
9926 Py_INCREF(sep);
9927 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009928 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009929 }
9930
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009931 /* There are at least two things to join, or else we have a subclass
9932 * of str in the sequence.
9933 * Do a pre-pass to figure out the total amount of space we'll
9934 * need (sz), and see whether all argument are strings.
9935 */
9936 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009937#ifdef Py_DEBUG
9938 use_memcpy = 0;
9939#else
9940 use_memcpy = 1;
9941#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 for (i = 0; i < seqlen; i++) {
9943 const Py_ssize_t old_sz = sz;
9944 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 if (!PyUnicode_Check(item)) {
9946 PyErr_Format(PyExc_TypeError,
9947 "sequence item %zd: expected str instance,"
9948 " %.80s found",
9949 i, Py_TYPE(item)->tp_name);
9950 goto onError;
9951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (PyUnicode_READY(item) == -1)
9953 goto onError;
9954 sz += PyUnicode_GET_LENGTH(item);
9955 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009956 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009957 if (i != 0)
9958 sz += seplen;
9959 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9960 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009961 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009962 goto onError;
9963 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009964 if (use_memcpy && last_obj != NULL) {
9965 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9966 use_memcpy = 0;
9967 }
9968 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009969 }
Tim Petersced69f82003-09-16 20:30:58 +00009970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009972 if (res == NULL)
9973 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009974
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009975 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009976#ifdef Py_DEBUG
9977 use_memcpy = 0;
9978#else
9979 if (use_memcpy) {
9980 res_data = PyUnicode_1BYTE_DATA(res);
9981 kind = PyUnicode_KIND(res);
9982 if (seplen != 0)
9983 sep_data = PyUnicode_1BYTE_DATA(sep);
9984 }
9985#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009987 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009990 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009991 if (use_memcpy) {
9992 Py_MEMCPY(res_data,
9993 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009994 kind * seplen);
9995 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009996 }
9997 else {
9998 copy_characters(res, res_offset, sep, 0, seplen);
9999 res_offset += seplen;
10000 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010001 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010002 itemlen = PyUnicode_GET_LENGTH(item);
10003 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 if (use_memcpy) {
10005 Py_MEMCPY(res_data,
10006 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010007 kind * itemlen);
10008 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010009 }
10010 else {
10011 copy_characters(res, res_offset, item, 0, itemlen);
10012 res_offset += itemlen;
10013 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010014 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010015 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010016 if (use_memcpy)
10017 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010018 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010019 else
10020 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010021
Tim Peters05eba1f2004-08-27 21:32:02 +000010022 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010024 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010028 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010030 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 return NULL;
10032}
10033
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the character
 * buffer `data`, beginning at code-unit index `start`.  `kind` selects the
 * code-unit width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
 * PyUnicode_4BYTE_KIND.  PyUnicode_WCHAR_KIND and any other value trip an
 * assertion.
 *
 * This is a statement macro: `kind`, `value` and `length` may be expanded
 * more than once, so pass side-effect-free expressions.  No bounds or range
 * checking is done; the caller must make sure that `value` fits in the
 * selected code unit and that [start, start + length) is inside the buffer.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10057
Victor Stinner3fe55312012-01-04 00:33:50 +010010058Py_ssize_t
10059PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10060 Py_UCS4 fill_char)
10061{
10062 Py_ssize_t maxlen;
10063 enum PyUnicode_Kind kind;
10064 void *data;
10065
10066 if (!PyUnicode_Check(unicode)) {
10067 PyErr_BadInternalCall();
10068 return -1;
10069 }
10070 if (PyUnicode_READY(unicode) == -1)
10071 return -1;
10072 if (unicode_check_modifiable(unicode))
10073 return -1;
10074
10075 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10076 PyErr_SetString(PyExc_ValueError,
10077 "fill character is bigger than "
10078 "the string maximum character");
10079 return -1;
10080 }
10081
10082 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10083 length = Py_MIN(maxlen, length);
10084 if (length <= 0)
10085 return 0;
10086
10087 kind = PyUnicode_KIND(unicode);
10088 data = PyUnicode_DATA(unicode);
10089 FILL(kind, data, fill_char, start, length);
10090 return length;
10091}
10092
Victor Stinner9310abb2011-10-05 00:59:23 +020010093static PyObject *
10094pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010095 Py_ssize_t left,
10096 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 PyObject *u;
10100 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010101 int kind;
10102 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103
10104 if (left < 0)
10105 left = 0;
10106 if (right < 0)
10107 right = 0;
10108
Victor Stinnerc4b49542011-12-11 22:44:26 +010010109 if (left == 0 && right == 0)
10110 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10113 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010114 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10115 return NULL;
10116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10118 if (fill > maxchar)
10119 maxchar = fill;
10120 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010121 if (!u)
10122 return NULL;
10123
10124 kind = PyUnicode_KIND(u);
10125 data = PyUnicode_DATA(u);
10126 if (left)
10127 FILL(kind, data, fill, 0, left);
10128 if (right)
10129 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010130 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010131 assert(_PyUnicode_CheckConsistency(u, 1));
10132 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133}
10134
Alexander Belopolsky40018472011-02-26 01:02:56 +000010135PyObject *
10136PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
10140 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010141 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010142 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010143 if (PyUnicode_READY(string) == -1) {
10144 Py_DECREF(string);
10145 return NULL;
10146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
Benjamin Petersonead6b532011-12-20 17:23:42 -060010148 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 if (PyUnicode_IS_ASCII(string))
10151 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010153 PyUnicode_GET_LENGTH(string), keepends);
10154 else
10155 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010156 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 break;
10159 case PyUnicode_2BYTE_KIND:
10160 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010161 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 PyUnicode_GET_LENGTH(string), keepends);
10163 break;
10164 case PyUnicode_4BYTE_KIND:
10165 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010166 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 PyUnicode_GET_LENGTH(string), keepends);
10168 break;
10169 default:
10170 assert(0);
10171 list = 0;
10172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173 Py_DECREF(string);
10174 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175}
10176
Alexander Belopolsky40018472011-02-26 01:02:56 +000010177static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010178split(PyObject *self,
10179 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010180 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 int kind1, kind2, kind;
10183 void *buf1, *buf2;
10184 Py_ssize_t len1, len2;
10185 PyObject* out;
10186
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010188 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 if (PyUnicode_READY(self) == -1)
10191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010194 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 if (PyUnicode_IS_ASCII(self))
10197 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 PyUnicode_GET_LENGTH(self), maxcount
10200 );
10201 else
10202 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 PyUnicode_GET_LENGTH(self), maxcount
10205 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 case PyUnicode_2BYTE_KIND:
10207 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010208 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 PyUnicode_GET_LENGTH(self), maxcount
10210 );
10211 case PyUnicode_4BYTE_KIND:
10212 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 PyUnicode_GET_LENGTH(self), maxcount
10215 );
10216 default:
10217 assert(0);
10218 return NULL;
10219 }
10220
10221 if (PyUnicode_READY(substring) == -1)
10222 return NULL;
10223
10224 kind1 = PyUnicode_KIND(self);
10225 kind2 = PyUnicode_KIND(substring);
10226 kind = kind1 > kind2 ? kind1 : kind2;
10227 buf1 = PyUnicode_DATA(self);
10228 buf2 = PyUnicode_DATA(substring);
10229 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (!buf1)
10232 return NULL;
10233 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (!buf2) {
10236 if (kind1 != kind) PyMem_Free(buf1);
10237 return NULL;
10238 }
10239 len1 = PyUnicode_GET_LENGTH(self);
10240 len2 = PyUnicode_GET_LENGTH(substring);
10241
Benjamin Petersonead6b532011-12-20 17:23:42 -060010242 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010244 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10245 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010246 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010247 else
10248 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 break;
10251 case PyUnicode_2BYTE_KIND:
10252 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 break;
10255 case PyUnicode_4BYTE_KIND:
10256 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 break;
10259 default:
10260 out = NULL;
10261 }
10262 if (kind1 != kind)
10263 PyMem_Free(buf1);
10264 if (kind2 != kind)
10265 PyMem_Free(buf2);
10266 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267}
10268
Alexander Belopolsky40018472011-02-26 01:02:56 +000010269static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010270rsplit(PyObject *self,
10271 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010272 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 int kind1, kind2, kind;
10275 void *buf1, *buf2;
10276 Py_ssize_t len1, len2;
10277 PyObject* out;
10278
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010279 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010280 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (PyUnicode_READY(self) == -1)
10283 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010286 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 if (PyUnicode_IS_ASCII(self))
10289 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 PyUnicode_GET_LENGTH(self), maxcount
10292 );
10293 else
10294 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 PyUnicode_GET_LENGTH(self), maxcount
10297 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 case PyUnicode_2BYTE_KIND:
10299 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 PyUnicode_GET_LENGTH(self), maxcount
10302 );
10303 case PyUnicode_4BYTE_KIND:
10304 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 PyUnicode_GET_LENGTH(self), maxcount
10307 );
10308 default:
10309 assert(0);
10310 return NULL;
10311 }
10312
10313 if (PyUnicode_READY(substring) == -1)
10314 return NULL;
10315
10316 kind1 = PyUnicode_KIND(self);
10317 kind2 = PyUnicode_KIND(substring);
10318 kind = kind1 > kind2 ? kind1 : kind2;
10319 buf1 = PyUnicode_DATA(self);
10320 buf2 = PyUnicode_DATA(substring);
10321 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010322 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (!buf1)
10324 return NULL;
10325 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010326 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 if (!buf2) {
10328 if (kind1 != kind) PyMem_Free(buf1);
10329 return NULL;
10330 }
10331 len1 = PyUnicode_GET_LENGTH(self);
10332 len2 = PyUnicode_GET_LENGTH(substring);
10333
Benjamin Petersonead6b532011-12-20 17:23:42 -060010334 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10337 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010339 else
10340 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 break;
10343 case PyUnicode_2BYTE_KIND:
10344 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 break;
10347 case PyUnicode_4BYTE_KIND:
10348 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 break;
10351 default:
10352 out = NULL;
10353 }
10354 if (kind1 != kind)
10355 PyMem_Free(buf1);
10356 if (kind2 != kind)
10357 PyMem_Free(buf2);
10358 return out;
10359}
10360
10361static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010362anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10363 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010365 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10368 return asciilib_find(buf1, len1, buf2, len2, offset);
10369 else
10370 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 case PyUnicode_2BYTE_KIND:
10372 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10373 case PyUnicode_4BYTE_KIND:
10374 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10375 }
10376 assert(0);
10377 return -1;
10378}
10379
10380static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010381anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10382 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010384 switch (kind) {
10385 case PyUnicode_1BYTE_KIND:
10386 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10387 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10388 else
10389 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10390 case PyUnicode_2BYTE_KIND:
10391 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10392 case PyUnicode_4BYTE_KIND:
10393 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10394 }
10395 assert(0);
10396 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010397}
10398
Alexander Belopolsky40018472011-02-26 01:02:56 +000010399static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400replace(PyObject *self, PyObject *str1,
10401 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 PyObject *u;
10404 char *sbuf = PyUnicode_DATA(self);
10405 char *buf1 = PyUnicode_DATA(str1);
10406 char *buf2 = PyUnicode_DATA(str2);
10407 int srelease = 0, release1 = 0, release2 = 0;
10408 int skind = PyUnicode_KIND(self);
10409 int kind1 = PyUnicode_KIND(str1);
10410 int kind2 = PyUnicode_KIND(str2);
10411 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10412 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10413 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 int mayshrink;
10415 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416
10417 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010420 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421
Victor Stinner59de0ee2011-10-07 10:01:28 +020010422 if (str1 == str2)
10423 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (skind < kind1)
10425 /* substring too wide to be present */
10426 goto nothing;
10427
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10429 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10430 /* Replacing str1 with str2 may cause a maxchar reduction in the
10431 result string. */
10432 mayshrink = (maxchar_str2 < maxchar);
10433 maxchar = Py_MAX(maxchar, maxchar_str2);
10434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010436 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010438 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010441 Py_UCS4 u1, u2;
10442 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010443 Py_ssize_t index, pos;
10444 char *src;
10445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010447 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10448 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010454 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010456
10457 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10458 index = 0;
10459 src = sbuf;
10460 while (--maxcount)
10461 {
10462 pos++;
10463 src += pos * PyUnicode_KIND(self);
10464 slen -= pos;
10465 index += pos;
10466 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10467 if (pos < 0)
10468 break;
10469 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10470 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010471 }
10472 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 int rkind = skind;
10474 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010475 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (kind1 < rkind) {
10478 /* widen substring */
10479 buf1 = _PyUnicode_AsKind(str1, rkind);
10480 if (!buf1) goto error;
10481 release1 = 1;
10482 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010483 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 if (i < 0)
10485 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (rkind > kind2) {
10487 /* widen replacement */
10488 buf2 = _PyUnicode_AsKind(str2, rkind);
10489 if (!buf2) goto error;
10490 release2 = 1;
10491 }
10492 else if (rkind < kind2) {
10493 /* widen self and buf1 */
10494 rkind = kind2;
10495 if (release1) PyMem_Free(buf1);
10496 sbuf = _PyUnicode_AsKind(self, rkind);
10497 if (!sbuf) goto error;
10498 srelease = 1;
10499 buf1 = _PyUnicode_AsKind(str1, rkind);
10500 if (!buf1) goto error;
10501 release1 = 1;
10502 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010503 u = PyUnicode_New(slen, maxchar);
10504 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 assert(PyUnicode_KIND(u) == rkind);
10507 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010508
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010509 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010510 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010511 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010513 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010515
10516 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010517 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010518 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010520 if (i == -1)
10521 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010522 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010524 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010528 }
10529 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 Py_ssize_t n, i, j, ires;
10531 Py_ssize_t product, new_size;
10532 int rkind = skind;
10533 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010536 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 buf1 = _PyUnicode_AsKind(str1, rkind);
10538 if (!buf1) goto error;
10539 release1 = 1;
10540 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010541 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010542 if (n == 0)
10543 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010545 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 buf2 = _PyUnicode_AsKind(str2, rkind);
10547 if (!buf2) goto error;
10548 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010551 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 rkind = kind2;
10553 sbuf = _PyUnicode_AsKind(self, rkind);
10554 if (!sbuf) goto error;
10555 srelease = 1;
10556 if (release1) PyMem_Free(buf1);
10557 buf1 = _PyUnicode_AsKind(str1, rkind);
10558 if (!buf1) goto error;
10559 release1 = 1;
10560 }
10561 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10562 PyUnicode_GET_LENGTH(str1))); */
10563 product = n * (len2-len1);
10564 if ((product / (len2-len1)) != n) {
10565 PyErr_SetString(PyExc_OverflowError,
10566 "replace string is too long");
10567 goto error;
10568 }
10569 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010570 if (new_size == 0) {
10571 Py_INCREF(unicode_empty);
10572 u = unicode_empty;
10573 goto done;
10574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10576 PyErr_SetString(PyExc_OverflowError,
10577 "replace string is too long");
10578 goto error;
10579 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 u = PyUnicode_New(new_size, maxchar);
10581 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 assert(PyUnicode_KIND(u) == rkind);
10584 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 ires = i = 0;
10586 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010587 while (n-- > 0) {
10588 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010590 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010592 if (j == -1)
10593 break;
10594 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010596 memcpy(res + rkind * ires,
10597 sbuf + rkind * i,
10598 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 }
10601 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010603 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010605 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010612 memcpy(res + rkind * ires,
10613 sbuf + rkind * i,
10614 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 }
10616 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010617 /* interleave */
10618 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010621 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 if (--n <= 0)
10624 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010625 memcpy(res + rkind * ires,
10626 sbuf + rkind * i,
10627 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 ires++;
10629 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010631 memcpy(res + rkind * ires,
10632 sbuf + rkind * i,
10633 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 }
10636
10637 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010638 unicode_adjust_maxchar(&u);
10639 if (u == NULL)
10640 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010642
10643 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (srelease)
10645 PyMem_FREE(sbuf);
10646 if (release1)
10647 PyMem_FREE(buf1);
10648 if (release2)
10649 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010650 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (srelease)
10656 PyMem_FREE(sbuf);
10657 if (release1)
10658 PyMem_FREE(buf1);
10659 if (release2)
10660 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010661 return unicode_result_unchanged(self);
10662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 error:
10664 if (srelease && sbuf)
10665 PyMem_FREE(sbuf);
10666 if (release1 && buf1)
10667 PyMem_FREE(buf1);
10668 if (release2 && buf2)
10669 PyMem_FREE(buf2);
10670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671}
10672
10673/* --- Unicode Object Methods --------------------------------------------- */
10674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010675PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677\n\
10678Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010679characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680
10681static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010682unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010684 if (PyUnicode_READY(self) == -1)
10685 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010686 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687}
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
10692Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010693have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010696unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010698 if (PyUnicode_READY(self) == -1)
10699 return NULL;
10700 if (PyUnicode_GET_LENGTH(self) == 0)
10701 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010702 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703}
10704
Benjamin Petersond5890c82012-01-14 13:23:30 -050010705PyDoc_STRVAR(casefold__doc__,
10706 "S.casefold() -> str\n\
10707\n\
10708Return a version of S suitable for caseless comparisons.");
10709
10710static PyObject *
10711unicode_casefold(PyObject *self)
10712{
10713 if (PyUnicode_READY(self) == -1)
10714 return NULL;
10715 if (PyUnicode_IS_ASCII(self))
10716 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010717 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010718}
10719
10720
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010721/* Argument converter. Coerces to a single unicode character */
10722
10723static int
10724convert_uc(PyObject *obj, void *addr)
10725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010727 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010728
Benjamin Peterson14339b62009-01-31 16:36:08 +000010729 uniobj = PyUnicode_FromObject(obj);
10730 if (uniobj == NULL) {
10731 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010733 return 0;
10734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010736 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010737 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010738 Py_DECREF(uniobj);
10739 return 0;
10740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010742 Py_DECREF(uniobj);
10743 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010744}
10745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010746PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010749Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010750done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751
10752static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010753unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010755 Py_ssize_t marg, left;
10756 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_UCS4 fillchar = ' ';
10758
Victor Stinnere9a29352011-10-01 02:14:59 +020010759 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
Benjamin Petersonbac79492012-01-14 13:34:47 -050010762 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763 return NULL;
10764
Victor Stinnerc4b49542011-12-11 22:44:26 +010010765 if (PyUnicode_GET_LENGTH(self) >= width)
10766 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
Victor Stinnerc4b49542011-12-11 22:44:26 +010010768 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 left = marg / 2 + (marg & width & 1);
10770
Victor Stinner9310abb2011-10-05 00:59:23 +020010771 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772}
10773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774/* This function assumes that str1 and str2 are readied by the caller. */
10775
Marc-André Lemburge5034372000-08-08 08:04:29 +000010776static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010777unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 int kind1, kind2;
10780 void *data1, *data2;
10781 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 kind1 = PyUnicode_KIND(str1);
10784 kind2 = PyUnicode_KIND(str2);
10785 data1 = PyUnicode_DATA(str1);
10786 data2 = PyUnicode_DATA(str2);
10787 len1 = PyUnicode_GET_LENGTH(str1);
10788 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 for (i = 0; i < len1 && i < len2; ++i) {
10791 Py_UCS4 c1, c2;
10792 c1 = PyUnicode_READ(kind1, data1, i);
10793 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010794
10795 if (c1 != c2)
10796 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010797 }
10798
10799 return (len1 < len2) ? -1 : (len1 != len2);
10800}
10801
Alexander Belopolsky40018472011-02-26 01:02:56 +000010802int
10803PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10806 if (PyUnicode_READY(left) == -1 ||
10807 PyUnicode_READY(right) == -1)
10808 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010809 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010811 PyErr_Format(PyExc_TypeError,
10812 "Can't compare %.100s and %.100s",
10813 left->ob_type->tp_name,
10814 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815 return -1;
10816}
10817
Martin v. Löwis5b222132007-06-10 09:51:05 +000010818int
10819PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 Py_ssize_t i;
10822 int kind;
10823 void *data;
10824 Py_UCS4 chr;
10825
Victor Stinner910337b2011-10-03 03:20:16 +020010826 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 if (PyUnicode_READY(uni) == -1)
10828 return -1;
10829 kind = PyUnicode_KIND(uni);
10830 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010831 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10833 if (chr != str[i])
10834 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010835 /* This check keeps Python strings that end in '\0' from comparing equal
10836 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010838 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010839 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010840 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010841 return 0;
10842}
10843
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010844
/* Map a C truth value to Py_True or Py_False.  The macro does not touch
   the reference count; users must Py_INCREF the result before handing it
   out as a new reference. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010847
Alexander Belopolsky40018472011-02-26 01:02:56 +000010848PyObject *
10849PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010850{
10851 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010852
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010853 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10854 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (PyUnicode_READY(left) == -1 ||
10856 PyUnicode_READY(right) == -1)
10857 return NULL;
10858 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10859 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010860 if (op == Py_EQ) {
10861 Py_INCREF(Py_False);
10862 return Py_False;
10863 }
10864 if (op == Py_NE) {
10865 Py_INCREF(Py_True);
10866 return Py_True;
10867 }
10868 }
10869 if (left == right)
10870 result = 0;
10871 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010872 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010874 /* Convert the return value to a Boolean */
10875 switch (op) {
10876 case Py_EQ:
10877 v = TEST_COND(result == 0);
10878 break;
10879 case Py_NE:
10880 v = TEST_COND(result != 0);
10881 break;
10882 case Py_LE:
10883 v = TEST_COND(result <= 0);
10884 break;
10885 case Py_GE:
10886 v = TEST_COND(result >= 0);
10887 break;
10888 case Py_LT:
10889 v = TEST_COND(result == -1);
10890 break;
10891 case Py_GT:
10892 v = TEST_COND(result == 1);
10893 break;
10894 default:
10895 PyErr_BadArgument();
10896 return NULL;
10897 }
10898 Py_INCREF(v);
10899 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010900 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010901
Brian Curtindfc80e32011-08-10 20:28:54 -050010902 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010903}
10904
Alexander Belopolsky40018472011-02-26 01:02:56 +000010905int
10906PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010907{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 int kind1, kind2, kind;
10910 void *buf1, *buf2;
10911 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010912 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010913
10914 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010915 sub = PyUnicode_FromObject(element);
10916 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010917 PyErr_Format(PyExc_TypeError,
10918 "'in <string>' requires string as left operand, not %s",
10919 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010920 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010921 }
10922
Thomas Wouters477c8d52006-05-27 19:21:47 +000010923 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010924 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010925 Py_DECREF(sub);
10926 return -1;
10927 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010928 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10929 Py_DECREF(sub);
10930 Py_DECREF(str);
10931 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 kind1 = PyUnicode_KIND(str);
10934 kind2 = PyUnicode_KIND(sub);
10935 kind = kind1 > kind2 ? kind1 : kind2;
10936 buf1 = PyUnicode_DATA(str);
10937 buf2 = PyUnicode_DATA(sub);
10938 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010939 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 if (!buf1) {
10941 Py_DECREF(sub);
10942 return -1;
10943 }
10944 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010945 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (!buf2) {
10947 Py_DECREF(sub);
10948 if (kind1 != kind) PyMem_Free(buf1);
10949 return -1;
10950 }
10951 len1 = PyUnicode_GET_LENGTH(str);
10952 len2 = PyUnicode_GET_LENGTH(sub);
10953
Benjamin Petersonead6b532011-12-20 17:23:42 -060010954 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 case PyUnicode_1BYTE_KIND:
10956 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10957 break;
10958 case PyUnicode_2BYTE_KIND:
10959 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10960 break;
10961 case PyUnicode_4BYTE_KIND:
10962 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10963 break;
10964 default:
10965 result = -1;
10966 assert(0);
10967 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010968
10969 Py_DECREF(str);
10970 Py_DECREF(sub);
10971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 if (kind1 != kind)
10973 PyMem_Free(buf1);
10974 if (kind2 != kind)
10975 PyMem_Free(buf2);
10976
Guido van Rossum403d68b2000-03-13 15:55:09 +000010977 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010978}
10979
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980/* Concat to string or Unicode object giving a new Unicode object. */
10981
Alexander Belopolsky40018472011-02-26 01:02:56 +000010982PyObject *
10983PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010986 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010987 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
10997 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010998 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011002 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 }
11006
Victor Stinner488fa492011-12-12 00:01:39 +010011007 u_len = PyUnicode_GET_LENGTH(u);
11008 v_len = PyUnicode_GET_LENGTH(v);
11009 if (u_len > PY_SSIZE_T_MAX - v_len) {
11010 PyErr_SetString(PyExc_OverflowError,
11011 "strings are too large to concat");
11012 goto onError;
11013 }
11014 new_len = u_len + v_len;
11015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011017 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11018 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011021 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011024 copy_characters(w, 0, u, 0, u_len);
11025 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 Py_DECREF(u);
11027 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011028 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 Py_XDECREF(u);
11033 Py_XDECREF(v);
11034 return NULL;
11035}
11036
Walter Dörwald1ab83302007-05-18 17:15:44 +000011037void
Victor Stinner23e56682011-10-03 03:54:37 +020011038PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011039{
Victor Stinner23e56682011-10-03 03:54:37 +020011040 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011041 Py_UCS4 maxchar, maxchar2;
11042 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011043
11044 if (p_left == NULL) {
11045 if (!PyErr_Occurred())
11046 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011047 return;
11048 }
Victor Stinner23e56682011-10-03 03:54:37 +020011049 left = *p_left;
11050 if (right == NULL || !PyUnicode_Check(left)) {
11051 if (!PyErr_Occurred())
11052 PyErr_BadInternalCall();
11053 goto error;
11054 }
11055
Benjamin Petersonbac79492012-01-14 13:34:47 -050011056 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011057 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011058 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011059 goto error;
11060
Victor Stinner488fa492011-12-12 00:01:39 +010011061 /* Shortcuts */
11062 if (left == unicode_empty) {
11063 Py_DECREF(left);
11064 Py_INCREF(right);
11065 *p_left = right;
11066 return;
11067 }
11068 if (right == unicode_empty)
11069 return;
11070
11071 left_len = PyUnicode_GET_LENGTH(left);
11072 right_len = PyUnicode_GET_LENGTH(right);
11073 if (left_len > PY_SSIZE_T_MAX - right_len) {
11074 PyErr_SetString(PyExc_OverflowError,
11075 "strings are too large to concat");
11076 goto error;
11077 }
11078 new_len = left_len + right_len;
11079
11080 if (unicode_modifiable(left)
11081 && PyUnicode_CheckExact(right)
11082 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011083 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11084 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011085 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011086 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011087 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11088 {
11089 /* append inplace */
11090 if (unicode_resize(p_left, new_len) != 0) {
11091 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11092 * deallocated so it cannot be put back into
11093 * 'variable'. The MemoryError is raised when there
11094 * is no value in 'variable', which might (very
11095 * remotely) be a cause of incompatibilities.
11096 */
11097 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011098 }
Victor Stinner488fa492011-12-12 00:01:39 +010011099 /* copy 'right' into the newly allocated area of 'left' */
11100 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011101 }
Victor Stinner488fa492011-12-12 00:01:39 +010011102 else {
11103 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11104 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11105 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011106
Victor Stinner488fa492011-12-12 00:01:39 +010011107 /* Concat the two Unicode strings */
11108 res = PyUnicode_New(new_len, maxchar);
11109 if (res == NULL)
11110 goto error;
11111 copy_characters(res, 0, left, 0, left_len);
11112 copy_characters(res, left_len, right, 0, right_len);
11113 Py_DECREF(left);
11114 *p_left = res;
11115 }
11116 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011117 return;
11118
11119error:
Victor Stinner488fa492011-12-12 00:01:39 +010011120 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011121}
11122
11123void
11124PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11125{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011126 PyUnicode_Append(pleft, right);
11127 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011128}
11129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011130PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011133Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011134string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011135interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
11137static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011138unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011140 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011141 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011142 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 int kind1, kind2, kind;
11145 void *buf1, *buf2;
11146 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
Jesus Ceaac451502011-04-20 17:09:23 +020011148 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11149 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 kind1 = PyUnicode_KIND(self);
11153 kind2 = PyUnicode_KIND(substring);
11154 kind = kind1 > kind2 ? kind1 : kind2;
11155 buf1 = PyUnicode_DATA(self);
11156 buf2 = PyUnicode_DATA(substring);
11157 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011158 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (!buf1) {
11160 Py_DECREF(substring);
11161 return NULL;
11162 }
11163 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011164 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 if (!buf2) {
11166 Py_DECREF(substring);
11167 if (kind1 != kind) PyMem_Free(buf1);
11168 return NULL;
11169 }
11170 len1 = PyUnicode_GET_LENGTH(self);
11171 len2 = PyUnicode_GET_LENGTH(substring);
11172
11173 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011174 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 case PyUnicode_1BYTE_KIND:
11176 iresult = ucs1lib_count(
11177 ((Py_UCS1*)buf1) + start, end - start,
11178 buf2, len2, PY_SSIZE_T_MAX
11179 );
11180 break;
11181 case PyUnicode_2BYTE_KIND:
11182 iresult = ucs2lib_count(
11183 ((Py_UCS2*)buf1) + start, end - start,
11184 buf2, len2, PY_SSIZE_T_MAX
11185 );
11186 break;
11187 case PyUnicode_4BYTE_KIND:
11188 iresult = ucs4lib_count(
11189 ((Py_UCS4*)buf1) + start, end - start,
11190 buf2, len2, PY_SSIZE_T_MAX
11191 );
11192 break;
11193 default:
11194 assert(0); iresult = 0;
11195 }
11196
11197 result = PyLong_FromSsize_t(iresult);
11198
11199 if (kind1 != kind)
11200 PyMem_Free(buf1);
11201 if (kind2 != kind)
11202 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
11204 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011205
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 return result;
11207}
11208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011209PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011210 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011212Encode S using the codec registered for encoding. Default encoding\n\
11213is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011214handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011215a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11216'xmlcharrefreplace' as well as any other name registered with\n\
11217codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011222 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 char *encoding = NULL;
11224 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011225
Benjamin Peterson308d6372009-09-18 21:42:35 +000011226 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11227 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011230}
11231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011232PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234\n\
11235Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
11238static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011239unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011241 Py_ssize_t i, j, line_pos, src_len, incr;
11242 Py_UCS4 ch;
11243 PyObject *u;
11244 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011246 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011247 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
11249 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
Antoine Pitrou22425222011-10-04 19:10:51 +020011252 if (PyUnicode_READY(self) == -1)
11253 return NULL;
11254
Thomas Wouters7e474022000-07-16 12:04:32 +000011255 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011256 src_len = PyUnicode_GET_LENGTH(self);
11257 i = j = line_pos = 0;
11258 kind = PyUnicode_KIND(self);
11259 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011260 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011261 for (; i < src_len; i++) {
11262 ch = PyUnicode_READ(kind, src_data, i);
11263 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011264 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011266 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 goto overflow;
11269 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011271 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 goto overflow;
11276 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011278 if (ch == '\n' || ch == '\r')
11279 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011281 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011282 if (!found)
11283 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011284
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 if (!u)
11288 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 for (; i < src_len; i++) {
11294 ch = PyUnicode_READ(kind, src_data, i);
11295 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 incr = tabsize - (line_pos % tabsize);
11298 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011299 FILL(kind, dest_data, ' ', j, incr);
11300 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011302 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 line_pos++;
11305 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011306 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011307 if (ch == '\n' || ch == '\r')
11308 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 }
11311 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011312 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011313
Antoine Pitroue71d5742011-10-04 15:55:09 +020011314 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011315 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317}
11318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011319PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011320 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321\n\
11322Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011323such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324arguments start and end are interpreted as in slice notation.\n\
11325\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
11328static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011331 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011332 Py_ssize_t start;
11333 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011334 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
Jesus Ceaac451502011-04-20 17:09:23 +020011336 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11337 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (PyUnicode_READY(self) == -1)
11341 return NULL;
11342 if (PyUnicode_READY(substring) == -1)
11343 return NULL;
11344
Victor Stinner7931d9a2011-11-04 00:22:48 +010011345 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
11347 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (result == -2)
11350 return NULL;
11351
Christian Heimes217cfd12007-12-02 14:31:20 +000011352 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353}
11354
11355static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011356unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011358 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11359 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362}
11363
Guido van Rossumc2504932007-09-18 19:42:40 +000011364/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011365 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011366static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011367unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Guido van Rossumc2504932007-09-18 19:42:40 +000011369 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011370 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011371
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011372#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011373 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011374#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 if (_PyUnicode_HASH(self) != -1)
11376 return _PyUnicode_HASH(self);
11377 if (PyUnicode_READY(self) == -1)
11378 return -1;
11379 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011380 /*
11381 We make the hash of the empty string be 0, rather than using
11382 (prefix ^ suffix), since this slightly obfuscates the hash secret
11383 */
11384 if (len == 0) {
11385 _PyUnicode_HASH(self) = 0;
11386 return 0;
11387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388
11389 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011390#define HASH(P) \
11391 x ^= (Py_uhash_t) *P << 7; \
11392 while (--len >= 0) \
11393 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394
Georg Brandl2fb477c2012-02-21 00:33:36 +010011395 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 switch (PyUnicode_KIND(self)) {
11397 case PyUnicode_1BYTE_KIND: {
11398 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11399 HASH(c);
11400 break;
11401 }
11402 case PyUnicode_2BYTE_KIND: {
11403 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11404 HASH(s);
11405 break;
11406 }
11407 default: {
11408 Py_UCS4 *l;
11409 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11410 "Impossible switch case in unicode_hash");
11411 l = PyUnicode_4BYTE_DATA(self);
11412 HASH(l);
11413 break;
11414 }
11415 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011416 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11417 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418
Guido van Rossumc2504932007-09-18 19:42:40 +000011419 if (x == -1)
11420 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011422 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011434 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011435 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011436 Py_ssize_t start;
11437 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Jesus Ceaac451502011-04-20 17:09:23 +020011439 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11440 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (PyUnicode_READY(self) == -1)
11444 return NULL;
11445 if (PyUnicode_READY(substring) == -1)
11446 return NULL;
11447
Victor Stinner7931d9a2011-11-04 00:22:48 +010011448 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
11450 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (result == -2)
11453 return NULL;
11454
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 if (result < 0) {
11456 PyErr_SetString(PyExc_ValueError, "substring not found");
11457 return NULL;
11458 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011459
Christian Heimes217cfd12007-12-02 14:31:20 +000011460 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461}
11462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011463PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011466Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011467at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
11469static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011470unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 Py_ssize_t i, length;
11473 int kind;
11474 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 int cased;
11476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (PyUnicode_READY(self) == -1)
11478 return NULL;
11479 length = PyUnicode_GET_LENGTH(self);
11480 kind = PyUnicode_KIND(self);
11481 data = PyUnicode_DATA(self);
11482
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (length == 1)
11485 return PyBool_FromLong(
11486 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011488 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011491
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 for (i = 0; i < length; i++) {
11494 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011495
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11497 return PyBool_FromLong(0);
11498 else if (!cased && Py_UNICODE_ISLOWER(ch))
11499 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011501 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502}
11503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011504PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011507Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011508at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
11510static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011511unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 Py_ssize_t i, length;
11514 int kind;
11515 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 int cased;
11517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (PyUnicode_READY(self) == -1)
11519 return NULL;
11520 length = PyUnicode_GET_LENGTH(self);
11521 kind = PyUnicode_KIND(self);
11522 data = PyUnicode_DATA(self);
11523
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 if (length == 1)
11526 return PyBool_FromLong(
11527 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011529 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011532
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 for (i = 0; i < length; i++) {
11535 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011536
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11538 return PyBool_FromLong(0);
11539 else if (!cased && Py_UNICODE_ISUPPER(ch))
11540 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011542 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543}
11544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011545PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011548Return True if S is a titlecased string and there is at least one\n\
11549character in S, i.e. upper- and titlecase characters may only\n\
11550follow uncased characters and lowercase characters only cased ones.\n\
11551Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
11553static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011554unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 Py_ssize_t i, length;
11557 int kind;
11558 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 int cased, previous_is_cased;
11560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 if (PyUnicode_READY(self) == -1)
11562 return NULL;
11563 length = PyUnicode_GET_LENGTH(self);
11564 kind = PyUnicode_KIND(self);
11565 data = PyUnicode_DATA(self);
11566
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 if (length == 1) {
11569 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11570 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11571 (Py_UNICODE_ISUPPER(ch) != 0));
11572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011574 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011577
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 cased = 0;
11579 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 for (i = 0; i < length; i++) {
11581 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011582
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11584 if (previous_is_cased)
11585 return PyBool_FromLong(0);
11586 previous_is_cased = 1;
11587 cased = 1;
11588 }
11589 else if (Py_UNICODE_ISLOWER(ch)) {
11590 if (!previous_is_cased)
11591 return PyBool_FromLong(0);
11592 previous_is_cased = 1;
11593 cased = 1;
11594 }
11595 else
11596 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011598 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599}
11600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011601PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011604Return True if all characters in S are whitespace\n\
11605and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
11607static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011608unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 Py_ssize_t i, length;
11611 int kind;
11612 void *data;
11613
11614 if (PyUnicode_READY(self) == -1)
11615 return NULL;
11616 length = PyUnicode_GET_LENGTH(self);
11617 kind = PyUnicode_KIND(self);
11618 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (length == 1)
11622 return PyBool_FromLong(
11623 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011625 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 for (i = 0; i < length; i++) {
11630 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011631 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011634 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635}
11636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011637PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011640Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011641and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011642
11643static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011644unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011645{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 Py_ssize_t i, length;
11647 int kind;
11648 void *data;
11649
11650 if (PyUnicode_READY(self) == -1)
11651 return NULL;
11652 length = PyUnicode_GET_LENGTH(self);
11653 kind = PyUnicode_KIND(self);
11654 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 if (length == 1)
11658 return PyBool_FromLong(
11659 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660
11661 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 for (i = 0; i < length; i++) {
11666 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011668 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011669 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011670}
11671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011672PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011673 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011674\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011675Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677
11678static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011679unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 int kind;
11682 void *data;
11683 Py_ssize_t len, i;
11684
11685 if (PyUnicode_READY(self) == -1)
11686 return NULL;
11687
11688 kind = PyUnicode_KIND(self);
11689 data = PyUnicode_DATA(self);
11690 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011692 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 if (len == 1) {
11694 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11695 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11696 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011697
11698 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 for (i = 0; i < len; i++) {
11703 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011704 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011706 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011707 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011708}
11709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011710PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011713Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
11716static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011717unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 Py_ssize_t i, length;
11720 int kind;
11721 void *data;
11722
11723 if (PyUnicode_READY(self) == -1)
11724 return NULL;
11725 length = PyUnicode_GET_LENGTH(self);
11726 kind = PyUnicode_KIND(self);
11727 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (length == 1)
11731 return PyBool_FromLong(
11732 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011734 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 for (i = 0; i < length; i++) {
11739 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011742 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743}
11744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011745PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011748Return True if all characters in S are digits\n\
11749and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
11751static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011752unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 Py_ssize_t i, length;
11755 int kind;
11756 void *data;
11757
11758 if (PyUnicode_READY(self) == -1)
11759 return NULL;
11760 length = PyUnicode_GET_LENGTH(self);
11761 kind = PyUnicode_KIND(self);
11762 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (length == 1) {
11766 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11767 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011770 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 for (i = 0; i < length; i++) {
11775 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011778 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779}
11780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011781PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011784Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011785False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
11787static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011788unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 Py_ssize_t i, length;
11791 int kind;
11792 void *data;
11793
11794 if (PyUnicode_READY(self) == -1)
11795 return NULL;
11796 length = PyUnicode_GET_LENGTH(self);
11797 kind = PyUnicode_KIND(self);
11798 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (length == 1)
11802 return PyBool_FromLong(
11803 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011805 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 for (i = 0; i < length; i++) {
11810 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011813 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814}
11815
Martin v. Löwis47383402007-08-15 07:32:56 +000011816int
11817PyUnicode_IsIdentifier(PyObject *self)
11818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 int kind;
11820 void *data;
11821 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011822 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (PyUnicode_READY(self) == -1) {
11825 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 }
11828
11829 /* Special case for empty strings */
11830 if (PyUnicode_GET_LENGTH(self) == 0)
11831 return 0;
11832 kind = PyUnicode_KIND(self);
11833 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011834
11835 /* PEP 3131 says that the first character must be in
11836 XID_Start and subsequent characters in XID_Continue,
11837 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011838 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011839 letters, digits, underscore). However, given the current
11840 definition of XID_Start and XID_Continue, it is sufficient
11841 to check just for these, except that _ must be allowed
11842 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011844 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011845 return 0;
11846
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011847 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011850 return 1;
11851}
11852
11853PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011855\n\
11856Return True if S is a valid identifier according\n\
11857to the language definition.");
11858
11859static PyObject*
11860unicode_isidentifier(PyObject *self)
11861{
11862 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11863}
11864
Georg Brandl559e5d72008-06-11 18:37:52 +000011865PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011867\n\
11868Return True if all characters in S are considered\n\
11869printable in repr() or S is empty, False otherwise.");
11870
11871static PyObject*
11872unicode_isprintable(PyObject *self)
11873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 Py_ssize_t i, length;
11875 int kind;
11876 void *data;
11877
11878 if (PyUnicode_READY(self) == -1)
11879 return NULL;
11880 length = PyUnicode_GET_LENGTH(self);
11881 kind = PyUnicode_KIND(self);
11882 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011883
11884 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 if (length == 1)
11886 return PyBool_FromLong(
11887 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 for (i = 0; i < length; i++) {
11890 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011891 Py_RETURN_FALSE;
11892 }
11893 }
11894 Py_RETURN_TRUE;
11895}
11896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011897PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011898 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899\n\
11900Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011901iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
11903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011904unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011906 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907}
11908
Martin v. Löwis18e16552006-02-15 17:27:45 +000011909static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011910unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (PyUnicode_READY(self) == -1)
11913 return -1;
11914 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011917PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011920Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011921done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
11923static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011924unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011926 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 Py_UCS4 fillchar = ' ';
11928
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011929 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 return NULL;
11931
Benjamin Petersonbac79492012-01-14 13:34:47 -050011932 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
Victor Stinnerc4b49542011-12-11 22:44:26 +010011935 if (PyUnicode_GET_LENGTH(self) >= width)
11936 return unicode_result_unchanged(self);
11937
11938 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939}
11940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011941PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011944Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
11946static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011947unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011949 if (PyUnicode_READY(self) == -1)
11950 return NULL;
11951 if (PyUnicode_IS_ASCII(self))
11952 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011953 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954}
11955
/* Strip modes.  The values double as indexes into stripformat[] below,
   so they must stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode.  Both the
   strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11964
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965/* externally visible for str.strip(unicode) */
11966PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011967_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 void *data;
11970 int kind;
11971 Py_ssize_t i, j, len;
11972 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11975 return NULL;
11976
11977 kind = PyUnicode_KIND(self);
11978 data = PyUnicode_DATA(self);
11979 len = PyUnicode_GET_LENGTH(self);
11980 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11981 PyUnicode_DATA(sepobj),
11982 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011983
Benjamin Peterson14339b62009-01-31 16:36:08 +000011984 i = 0;
11985 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 while (i < len &&
11987 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 i++;
11989 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011990 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011991
Benjamin Peterson14339b62009-01-31 16:36:08 +000011992 j = len;
11993 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 do {
11995 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 } while (j >= i &&
11997 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011999 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012000
Victor Stinner7931d9a2011-11-04 00:22:48 +010012001 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002}
12003
12004PyObject*
12005PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12006{
12007 unsigned char *data;
12008 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012009 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010
Victor Stinnerde636f32011-10-01 03:55:54 +020012011 if (PyUnicode_READY(self) == -1)
12012 return NULL;
12013
12014 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12015
Victor Stinner12bab6d2011-10-01 01:53:49 +020012016 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012017 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018
Victor Stinner12bab6d2011-10-01 01:53:49 +020012019 length = end - start;
12020 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012021 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022
Victor Stinnerde636f32011-10-01 03:55:54 +020012023 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012024 PyErr_SetString(PyExc_IndexError, "string index out of range");
12025 return NULL;
12026 }
12027
Victor Stinnerb9275c12011-10-05 14:01:42 +020012028 if (PyUnicode_IS_ASCII(self)) {
12029 kind = PyUnicode_KIND(self);
12030 data = PyUnicode_1BYTE_DATA(self);
12031 return unicode_fromascii(data + start, length);
12032 }
12033 else {
12034 kind = PyUnicode_KIND(self);
12035 data = PyUnicode_1BYTE_DATA(self);
12036 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012037 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012038 length);
12039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
12042static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012043do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 int kind;
12046 void *data;
12047 Py_ssize_t len, i, j;
12048
12049 if (PyUnicode_READY(self) == -1)
12050 return NULL;
12051
12052 kind = PyUnicode_KIND(self);
12053 data = PyUnicode_DATA(self);
12054 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012055
Benjamin Peterson14339b62009-01-31 16:36:08 +000012056 i = 0;
12057 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012059 i++;
12060 }
12061 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012062
Benjamin Peterson14339b62009-01-31 16:36:08 +000012063 j = len;
12064 if (striptype != LEFTSTRIP) {
12065 do {
12066 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 j++;
12069 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012070
Victor Stinner7931d9a2011-11-04 00:22:48 +010012071 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072}
12073
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012074
12075static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012076do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012078 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012079
Benjamin Peterson14339b62009-01-31 16:36:08 +000012080 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12081 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012082
Benjamin Peterson14339b62009-01-31 16:36:08 +000012083 if (sep != NULL && sep != Py_None) {
12084 if (PyUnicode_Check(sep))
12085 return _PyUnicode_XStrip(self, striptype, sep);
12086 else {
12087 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 "%s arg must be None or str",
12089 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 return NULL;
12091 }
12092 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012093
Benjamin Peterson14339b62009-01-31 16:36:08 +000012094 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012095}
12096
12097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012098PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012100\n\
12101Return a copy of the string S with leading and trailing\n\
12102whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012103If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012104
12105static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012106unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 if (PyTuple_GET_SIZE(args) == 0)
12109 return do_strip(self, BOTHSTRIP); /* Common case */
12110 else
12111 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012112}
12113
12114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012115PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012117\n\
12118Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012119If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120
12121static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012122unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012124 if (PyTuple_GET_SIZE(args) == 0)
12125 return do_strip(self, LEFTSTRIP); /* Common case */
12126 else
12127 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128}
12129
12130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012131PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133\n\
12134Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012135If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136
12137static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012138unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 if (PyTuple_GET_SIZE(args) == 0)
12141 return do_strip(self, RIGHTSTRIP); /* Common case */
12142 else
12143 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012144}
12145
12146
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
Georg Brandl222de0f2009-04-12 12:01:50 +000012153 if (len < 1) {
12154 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012155 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
Victor Stinnerc4b49542011-12-11 22:44:26 +010012158 /* no repeat, return original string */
12159 if (len == 1)
12160 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012161
Benjamin Petersonbac79492012-01-14 13:34:47 -050012162 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 return NULL;
12164
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012165 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012166 PyErr_SetString(PyExc_OverflowError,
12167 "repeated string is too long");
12168 return NULL;
12169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012171
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012172 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 if (!u)
12174 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012175 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (PyUnicode_GET_LENGTH(str) == 1) {
12178 const int kind = PyUnicode_KIND(str);
12179 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012180 if (kind == PyUnicode_1BYTE_KIND) {
12181 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012182 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012183 }
12184 else if (kind == PyUnicode_2BYTE_KIND) {
12185 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012186 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012187 ucs2[n] = fill_char;
12188 } else {
12189 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12190 assert(kind == PyUnicode_4BYTE_KIND);
12191 for (n = 0; n < len; ++n)
12192 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 }
12195 else {
12196 /* number of characters copied this far */
12197 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012198 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 char *to = (char *) PyUnicode_DATA(u);
12200 Py_MEMCPY(to, PyUnicode_DATA(str),
12201 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012202 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 n = (done <= nchars-done) ? done : nchars-done;
12204 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012205 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207 }
12208
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012209 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012210 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211}
12212
Alexander Belopolsky40018472011-02-26 01:02:56 +000012213PyObject *
12214PyUnicode_Replace(PyObject *obj,
12215 PyObject *subobj,
12216 PyObject *replobj,
12217 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
12219 PyObject *self;
12220 PyObject *str1;
12221 PyObject *str2;
12222 PyObject *result;
12223
12224 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012225 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012228 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 Py_DECREF(self);
12230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 }
12232 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012233 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 Py_DECREF(self);
12235 Py_DECREF(str1);
12236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012238 if (PyUnicode_READY(self) == -1 ||
12239 PyUnicode_READY(str1) == -1 ||
12240 PyUnicode_READY(str2) == -1)
12241 result = NULL;
12242 else
12243 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 Py_DECREF(self);
12245 Py_DECREF(str1);
12246 Py_DECREF(str2);
12247 return result;
12248}
12249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012250PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012251 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252\n\
12253Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012254old replaced by new. If the optional argument count is\n\
12255given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
12257static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 PyObject *str1;
12261 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012262 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 PyObject *result;
12264
Martin v. Löwis18e16552006-02-15 17:27:45 +000012265 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012267 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012268 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012270 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 return NULL;
12272 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012273 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 Py_DECREF(str1);
12275 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012276 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012277 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12278 result = NULL;
12279 else
12280 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
12282 Py_DECREF(str1);
12283 Py_DECREF(str2);
12284 return result;
12285}
12286
Alexander Belopolsky40018472011-02-26 01:02:56 +000012287static PyObject *
12288unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012290 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 Py_ssize_t isize;
12292 Py_ssize_t osize, squote, dquote, i, o;
12293 Py_UCS4 max, quote;
12294 int ikind, okind;
12295 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012298 return NULL;
12299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 isize = PyUnicode_GET_LENGTH(unicode);
12301 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 /* Compute length of output, quote characters, and
12304 maximum character */
12305 osize = 2; /* quotes */
12306 max = 127;
12307 squote = dquote = 0;
12308 ikind = PyUnicode_KIND(unicode);
12309 for (i = 0; i < isize; i++) {
12310 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12311 switch (ch) {
12312 case '\'': squote++; osize++; break;
12313 case '"': dquote++; osize++; break;
12314 case '\\': case '\t': case '\r': case '\n':
12315 osize += 2; break;
12316 default:
12317 /* Fast-path ASCII */
12318 if (ch < ' ' || ch == 0x7f)
12319 osize += 4; /* \xHH */
12320 else if (ch < 0x7f)
12321 osize++;
12322 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12323 osize++;
12324 max = ch > max ? ch : max;
12325 }
12326 else if (ch < 0x100)
12327 osize += 4; /* \xHH */
12328 else if (ch < 0x10000)
12329 osize += 6; /* \uHHHH */
12330 else
12331 osize += 10; /* \uHHHHHHHH */
12332 }
12333 }
12334
12335 quote = '\'';
12336 if (squote) {
12337 if (dquote)
12338 /* Both squote and dquote present. Use squote,
12339 and escape them */
12340 osize += squote;
12341 else
12342 quote = '"';
12343 }
12344
12345 repr = PyUnicode_New(osize, max);
12346 if (repr == NULL)
12347 return NULL;
12348 okind = PyUnicode_KIND(repr);
12349 odata = PyUnicode_DATA(repr);
12350
12351 PyUnicode_WRITE(okind, odata, 0, quote);
12352 PyUnicode_WRITE(okind, odata, osize-1, quote);
12353
12354 for (i = 0, o = 1; i < isize; i++) {
12355 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012356
12357 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if ((ch == quote) || (ch == '\\')) {
12359 PyUnicode_WRITE(okind, odata, o++, '\\');
12360 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012361 continue;
12362 }
12363
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012365 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 PyUnicode_WRITE(okind, odata, o++, '\\');
12367 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012368 }
12369 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 PyUnicode_WRITE(okind, odata, o++, '\\');
12371 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012372 }
12373 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 PyUnicode_WRITE(okind, odata, o++, '\\');
12375 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012376 }
12377
12378 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012379 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 PyUnicode_WRITE(okind, odata, o++, '\\');
12381 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012382 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12383 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012384 }
12385
Georg Brandl559e5d72008-06-11 18:37:52 +000012386 /* Copy ASCII characters as-is */
12387 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012389 }
12390
Benjamin Peterson29060642009-01-31 22:14:21 +000012391 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012392 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012393 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012394 (categories Z* and C* except ASCII space)
12395 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012397 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 if (ch <= 0xff) {
12399 PyUnicode_WRITE(okind, odata, o++, '\\');
12400 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012401 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12402 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012403 }
12404 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 else if (ch >= 0x10000) {
12406 PyUnicode_WRITE(okind, odata, o++, '\\');
12407 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012408 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12409 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12410 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12411 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12412 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12413 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012416 }
12417 /* Map 16-bit characters to '\uxxxx' */
12418 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012425 }
12426 }
12427 /* Copy characters as-is */
12428 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012430 }
12431 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012434 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012435 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436}
12437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012438PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440\n\
12441Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012442such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443arguments start and end are interpreted as in slice notation.\n\
12444\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012445Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446
12447static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012450 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012451 Py_ssize_t start;
12452 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
Jesus Ceaac451502011-04-20 17:09:23 +020012455 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12456 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 if (PyUnicode_READY(self) == -1)
12460 return NULL;
12461 if (PyUnicode_READY(substring) == -1)
12462 return NULL;
12463
Victor Stinner7931d9a2011-11-04 00:22:48 +010012464 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465
12466 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 if (result == -2)
12469 return NULL;
12470
Christian Heimes217cfd12007-12-02 14:31:20 +000012471 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472}
12473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012474PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012477Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478
12479static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012482 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012483 Py_ssize_t start;
12484 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012485 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486
Jesus Ceaac451502011-04-20 17:09:23 +020012487 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12488 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 if (PyUnicode_READY(self) == -1)
12492 return NULL;
12493 if (PyUnicode_READY(substring) == -1)
12494 return NULL;
12495
Victor Stinner7931d9a2011-11-04 00:22:48 +010012496 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
12498 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 if (result == -2)
12501 return NULL;
12502
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503 if (result < 0) {
12504 PyErr_SetString(PyExc_ValueError, "substring not found");
12505 return NULL;
12506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507
Christian Heimes217cfd12007-12-02 14:31:20 +000012508 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012514Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012515done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
12517static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012518unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012520 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 Py_UCS4 fillchar = ' ';
12522
Victor Stinnere9a29352011-10-01 02:14:59 +020012523 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012525
Benjamin Petersonbac79492012-01-14 13:34:47 -050012526 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527 return NULL;
12528
Victor Stinnerc4b49542011-12-11 22:44:26 +010012529 if (PyUnicode_GET_LENGTH(self) >= width)
12530 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531
Victor Stinnerc4b49542011-12-11 22:44:26 +010012532 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533}
12534
Alexander Belopolsky40018472011-02-26 01:02:56 +000012535PyObject *
12536PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537{
12538 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012539
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540 s = PyUnicode_FromObject(s);
12541 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012542 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012543 if (sep != NULL) {
12544 sep = PyUnicode_FromObject(sep);
12545 if (sep == NULL) {
12546 Py_DECREF(s);
12547 return NULL;
12548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 }
12550
Victor Stinner9310abb2011-10-05 00:59:23 +020012551 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552
12553 Py_DECREF(s);
12554 Py_XDECREF(sep);
12555 return result;
12556}
12557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012558PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012559 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560\n\
12561Return a list of the words in S, using sep as the\n\
12562delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012563splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012564whitespace string is a separator and empty strings are\n\
12565removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
12567static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012568unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012570 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012572 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012574 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12575 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576 return NULL;
12577
12578 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012581 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012583 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
12585
Thomas Wouters477c8d52006-05-27 19:21:47 +000012586PyObject *
12587PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12588{
12589 PyObject* str_obj;
12590 PyObject* sep_obj;
12591 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 int kind1, kind2, kind;
12593 void *buf1 = NULL, *buf2 = NULL;
12594 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012595
12596 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012597 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012598 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012600 if (!sep_obj) {
12601 Py_DECREF(str_obj);
12602 return NULL;
12603 }
12604 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12605 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012606 Py_DECREF(str_obj);
12607 return NULL;
12608 }
12609
Victor Stinner14f8f022011-10-05 20:58:25 +020012610 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012612 kind = Py_MAX(kind1, kind2);
12613 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012615 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 if (!buf1)
12617 goto onError;
12618 buf2 = PyUnicode_DATA(sep_obj);
12619 if (kind2 != kind)
12620 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12621 if (!buf2)
12622 goto onError;
12623 len1 = PyUnicode_GET_LENGTH(str_obj);
12624 len2 = PyUnicode_GET_LENGTH(sep_obj);
12625
Benjamin Petersonead6b532011-12-20 17:23:42 -060012626 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012628 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12629 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12630 else
12631 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 break;
12633 case PyUnicode_2BYTE_KIND:
12634 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12635 break;
12636 case PyUnicode_4BYTE_KIND:
12637 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12638 break;
12639 default:
12640 assert(0);
12641 out = 0;
12642 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643
12644 Py_DECREF(sep_obj);
12645 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 if (kind1 != kind)
12647 PyMem_Free(buf1);
12648 if (kind2 != kind)
12649 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012650
12651 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 onError:
12653 Py_DECREF(sep_obj);
12654 Py_DECREF(str_obj);
12655 if (kind1 != kind && buf1)
12656 PyMem_Free(buf1);
12657 if (kind2 != kind && buf2)
12658 PyMem_Free(buf2);
12659 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660}
12661
12662
12663PyObject *
12664PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12665{
12666 PyObject* str_obj;
12667 PyObject* sep_obj;
12668 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 int kind1, kind2, kind;
12670 void *buf1 = NULL, *buf2 = NULL;
12671 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012672
12673 str_obj = PyUnicode_FromObject(str_in);
12674 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012675 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012676 sep_obj = PyUnicode_FromObject(sep_in);
12677 if (!sep_obj) {
12678 Py_DECREF(str_obj);
12679 return NULL;
12680 }
12681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 kind1 = PyUnicode_KIND(str_in);
12683 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012684 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 buf1 = PyUnicode_DATA(str_in);
12686 if (kind1 != kind)
12687 buf1 = _PyUnicode_AsKind(str_in, kind);
12688 if (!buf1)
12689 goto onError;
12690 buf2 = PyUnicode_DATA(sep_obj);
12691 if (kind2 != kind)
12692 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12693 if (!buf2)
12694 goto onError;
12695 len1 = PyUnicode_GET_LENGTH(str_obj);
12696 len2 = PyUnicode_GET_LENGTH(sep_obj);
12697
Benjamin Petersonead6b532011-12-20 17:23:42 -060012698 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012700 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12701 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12702 else
12703 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 break;
12705 case PyUnicode_2BYTE_KIND:
12706 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12707 break;
12708 case PyUnicode_4BYTE_KIND:
12709 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12710 break;
12711 default:
12712 assert(0);
12713 out = 0;
12714 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715
12716 Py_DECREF(sep_obj);
12717 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 if (kind1 != kind)
12719 PyMem_Free(buf1);
12720 if (kind2 != kind)
12721 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012722
12723 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 onError:
12725 Py_DECREF(sep_obj);
12726 Py_DECREF(str_obj);
12727 if (kind1 != kind && buf1)
12728 PyMem_Free(buf1);
12729 if (kind2 != kind && buf2)
12730 PyMem_Free(buf2);
12731 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012732}
12733
12734PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012737Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012739found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012740
12741static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012742unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743{
Victor Stinner9310abb2011-10-05 00:59:23 +020012744 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745}
12746
12747PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012748 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012749\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012750Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012751the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012752separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012753
12754static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012755unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756{
Victor Stinner9310abb2011-10-05 00:59:23 +020012757 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012758}
12759
Alexander Belopolsky40018472011-02-26 01:02:56 +000012760PyObject *
12761PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012762{
12763 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012765 s = PyUnicode_FromObject(s);
12766 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 if (sep != NULL) {
12769 sep = PyUnicode_FromObject(sep);
12770 if (sep == NULL) {
12771 Py_DECREF(s);
12772 return NULL;
12773 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012774 }
12775
Victor Stinner9310abb2011-10-05 00:59:23 +020012776 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012777
12778 Py_DECREF(s);
12779 Py_XDECREF(sep);
12780 return result;
12781}
12782
12783PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012784 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012785\n\
12786Return a list of the words in S, using sep as the\n\
12787delimiter string, starting at the end of the string and\n\
12788working to the front. If maxsplit is given, at most maxsplit\n\
12789splits are done. If sep is not specified, any whitespace string\n\
12790is a separator.");
12791
12792static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012793unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012794{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012795 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012796 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012797 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012798
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012799 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12800 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012801 return NULL;
12802
12803 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012805 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012806 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012807 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012808 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012809}
12810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012811PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813\n\
12814Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012815Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012816is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012819unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012821 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012822 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012824 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12825 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826 return NULL;
12827
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012828 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829}
12830
12831static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012832PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012834 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835}
12836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012837PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839\n\
12840Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012841and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
12843static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012844unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012846 if (PyUnicode_READY(self) == -1)
12847 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012848 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849}
12850
Georg Brandlceee0772007-11-27 23:48:05 +000012851PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012853\n\
12854Return a translation table usable for str.translate().\n\
12855If there is only one argument, it must be a dictionary mapping Unicode\n\
12856ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012857Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012858If there are two arguments, they must be strings of equal length, and\n\
12859in the resulting dictionary, each character in x will be mapped to the\n\
12860character at the same position in y. If there is a third argument, it\n\
12861must be a string, whose characters will be mapped to None in the result.");
12862
12863static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012864unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012865{
12866 PyObject *x, *y = NULL, *z = NULL;
12867 PyObject *new = NULL, *key, *value;
12868 Py_ssize_t i = 0;
12869 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012870
Georg Brandlceee0772007-11-27 23:48:05 +000012871 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12872 return NULL;
12873 new = PyDict_New();
12874 if (!new)
12875 return NULL;
12876 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 int x_kind, y_kind, z_kind;
12878 void *x_data, *y_data, *z_data;
12879
Georg Brandlceee0772007-11-27 23:48:05 +000012880 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012881 if (!PyUnicode_Check(x)) {
12882 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12883 "be a string if there is a second argument");
12884 goto err;
12885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012887 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12888 "arguments must have equal length");
12889 goto err;
12890 }
12891 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 x_kind = PyUnicode_KIND(x);
12893 y_kind = PyUnicode_KIND(y);
12894 x_data = PyUnicode_DATA(x);
12895 y_data = PyUnicode_DATA(y);
12896 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12897 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012898 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012899 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012900 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012901 if (!value) {
12902 Py_DECREF(key);
12903 goto err;
12904 }
Georg Brandlceee0772007-11-27 23:48:05 +000012905 res = PyDict_SetItem(new, key, value);
12906 Py_DECREF(key);
12907 Py_DECREF(value);
12908 if (res < 0)
12909 goto err;
12910 }
12911 /* create entries for deleting chars in z */
12912 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 z_kind = PyUnicode_KIND(z);
12914 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012915 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012917 if (!key)
12918 goto err;
12919 res = PyDict_SetItem(new, key, Py_None);
12920 Py_DECREF(key);
12921 if (res < 0)
12922 goto err;
12923 }
12924 }
12925 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 int kind;
12927 void *data;
12928
Georg Brandlceee0772007-11-27 23:48:05 +000012929 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012930 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012931 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12932 "to maketrans it must be a dict");
12933 goto err;
12934 }
12935 /* copy entries into the new dict, converting string keys to int keys */
12936 while (PyDict_Next(x, &i, &key, &value)) {
12937 if (PyUnicode_Check(key)) {
12938 /* convert string keys to integer keys */
12939 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012940 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012941 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12942 "table must be of length 1");
12943 goto err;
12944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 kind = PyUnicode_KIND(key);
12946 data = PyUnicode_DATA(key);
12947 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012948 if (!newkey)
12949 goto err;
12950 res = PyDict_SetItem(new, newkey, value);
12951 Py_DECREF(newkey);
12952 if (res < 0)
12953 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012954 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012955 /* just keep integer keys */
12956 if (PyDict_SetItem(new, key, value) < 0)
12957 goto err;
12958 } else {
12959 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12960 "be strings or integers");
12961 goto err;
12962 }
12963 }
12964 }
12965 return new;
12966 err:
12967 Py_DECREF(new);
12968 return NULL;
12969}
12970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012971PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012972 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973\n\
12974Return a copy of the string S, where all characters have been mapped\n\
12975through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012976Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012977Unmapped characters are left untouched. Characters mapped to None\n\
12978are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979
12980static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984}
12985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012986PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012989Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990
12991static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012992unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012994 if (PyUnicode_READY(self) == -1)
12995 return NULL;
12996 if (PyUnicode_IS_ASCII(self))
12997 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012998 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999}
13000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013001PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013004Pad a numeric string S with zeros on the left, to fill a field\n\
13005of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006
13007static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013008unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013010 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013011 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013012 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 int kind;
13014 void *data;
13015 Py_UCS4 chr;
13016
Martin v. Löwis18e16552006-02-15 17:27:45 +000013017 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018 return NULL;
13019
Benjamin Petersonbac79492012-01-14 13:34:47 -050013020 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022
Victor Stinnerc4b49542011-12-11 22:44:26 +010013023 if (PyUnicode_GET_LENGTH(self) >= width)
13024 return unicode_result_unchanged(self);
13025
13026 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027
13028 u = pad(self, fill, 0, '0');
13029
Walter Dörwald068325e2002-04-15 13:36:47 +000013030 if (u == NULL)
13031 return NULL;
13032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 kind = PyUnicode_KIND(u);
13034 data = PyUnicode_DATA(u);
13035 chr = PyUnicode_READ(kind, data, fill);
13036
13037 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 PyUnicode_WRITE(kind, data, 0, chr);
13040 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041 }
13042
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013043 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013044 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013055PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013056 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013058Return True if S starts with the specified prefix, False otherwise.\n\
13059With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013060With optional end, stop comparing S at that position.\n\
13061prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062
13063static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013064unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013065 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013067 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013068 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013069 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013070 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013071 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072
Jesus Ceaac451502011-04-20 17:09:23 +020013073 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013075 if (PyTuple_Check(subobj)) {
13076 Py_ssize_t i;
13077 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013078 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013079 if (substring == NULL)
13080 return NULL;
13081 result = tailmatch(self, substring, start, end, -1);
13082 Py_DECREF(substring);
13083 if (result) {
13084 Py_RETURN_TRUE;
13085 }
13086 }
13087 /* nothing matched */
13088 Py_RETURN_FALSE;
13089 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013090 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013091 if (substring == NULL) {
13092 if (PyErr_ExceptionMatches(PyExc_TypeError))
13093 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13094 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013096 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013097 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013099 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100}
13101
13102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013103PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013106Return True if S ends with the specified suffix, False otherwise.\n\
13107With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013108With optional end, stop comparing S at that position.\n\
13109suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
13111static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013112unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013115 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013116 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013117 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013118 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013119 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
Jesus Ceaac451502011-04-20 17:09:23 +020013121 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013123 if (PyTuple_Check(subobj)) {
13124 Py_ssize_t i;
13125 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013126 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013128 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013130 result = tailmatch(self, substring, start, end, +1);
13131 Py_DECREF(substring);
13132 if (result) {
13133 Py_RETURN_TRUE;
13134 }
13135 }
13136 Py_RETURN_FALSE;
13137 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013138 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013139 if (substring == NULL) {
13140 if (PyErr_ExceptionMatches(PyExc_TypeError))
13141 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13142 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013144 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013145 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013147 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148}
13149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013151
13152PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013154\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013155Return a formatted version of S, using substitutions from args and kwargs.\n\
13156The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013157
Eric Smith27bbca62010-11-04 17:06:58 +000013158PyDoc_STRVAR(format_map__doc__,
13159 "S.format_map(mapping) -> str\n\
13160\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013161Return a formatted version of S, using substitutions from mapping.\n\
13162The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013163
Eric Smith4a7d76d2008-05-30 18:10:19 +000013164static PyObject *
13165unicode__format__(PyObject* self, PyObject* args)
13166{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013167 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013168
13169 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13170 return NULL;
13171
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013172 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013174 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013175}
13176
Eric Smith8c663262007-08-25 02:26:07 +000013177PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013179\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013180Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013181
13182static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013183unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 Py_ssize_t size;
13186
13187 /* If it's a compact object, account for base structure +
13188 character data. */
13189 if (PyUnicode_IS_COMPACT_ASCII(v))
13190 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13191 else if (PyUnicode_IS_COMPACT(v))
13192 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013193 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 else {
13195 /* If it is a two-block object, account for base object, and
13196 for character block if present. */
13197 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013198 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013200 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 }
13202 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013203 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013204 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013206 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013207 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208
13209 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013210}
13211
13212PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013214
13215static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013216unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013217{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013218 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 if (!copy)
13220 return NULL;
13221 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013222}
13223
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013225 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013226 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013227 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13228 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013229 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13230 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013231 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013232 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13233 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13234 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13235 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13236 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013237 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013238 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13239 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13240 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013241 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013242 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13243 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13244 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013245 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013246 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013247 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013248 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013249 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13250 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13251 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13252 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13253 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13254 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13255 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13256 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13257 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13258 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13259 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13260 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13261 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13262 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013263 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013264 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013265 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013266 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013267 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013268 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013269 {"maketrans", (PyCFunction) unicode_maketrans,
13270 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013271 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013272#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013273 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013274 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013275#endif
13276
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278 {NULL, NULL}
13279};
13280
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013281static PyObject *
13282unicode_mod(PyObject *v, PyObject *w)
13283{
Brian Curtindfc80e32011-08-10 20:28:54 -050013284 if (!PyUnicode_Check(v))
13285 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013286 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013287}
13288
13289static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013290 0, /*nb_add*/
13291 0, /*nb_subtract*/
13292 0, /*nb_multiply*/
13293 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013294};
13295
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013297 (lenfunc) unicode_length, /* sq_length */
13298 PyUnicode_Concat, /* sq_concat */
13299 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13300 (ssizeargfunc) unicode_getitem, /* sq_item */
13301 0, /* sq_slice */
13302 0, /* sq_ass_item */
13303 0, /* sq_ass_slice */
13304 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305};
13306
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013307static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013308unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 if (PyUnicode_READY(self) == -1)
13311 return NULL;
13312
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013313 if (PyIndex_Check(item)) {
13314 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013315 if (i == -1 && PyErr_Occurred())
13316 return NULL;
13317 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013319 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013320 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013321 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013322 PyObject *result;
13323 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013324 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013325 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013329 return NULL;
13330 }
13331
13332 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013333 Py_INCREF(unicode_empty);
13334 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013336 slicelength == PyUnicode_GET_LENGTH(self)) {
13337 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013338 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013339 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013340 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013341 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013342 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013343 src_kind = PyUnicode_KIND(self);
13344 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013345 if (!PyUnicode_IS_ASCII(self)) {
13346 kind_limit = kind_maxchar_limit(src_kind);
13347 max_char = 0;
13348 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13349 ch = PyUnicode_READ(src_kind, src_data, cur);
13350 if (ch > max_char) {
13351 max_char = ch;
13352 if (max_char >= kind_limit)
13353 break;
13354 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013355 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013356 }
Victor Stinner55c99112011-10-13 01:17:06 +020013357 else
13358 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013359 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013360 if (result == NULL)
13361 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013362 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013363 dest_data = PyUnicode_DATA(result);
13364
13365 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013366 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13367 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013368 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013369 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013370 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013371 } else {
13372 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13373 return NULL;
13374 }
13375}
13376
13377static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013378 (lenfunc)unicode_length, /* mp_length */
13379 (binaryfunc)unicode_subscript, /* mp_subscript */
13380 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013381};
13382
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384/* Helpers for PyUnicode_Format() */
13385
13386static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013387getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013389 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 (*p_argidx)++;
13392 if (arglen < 0)
13393 return args;
13394 else
13395 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396 }
13397 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013399 return NULL;
13400}
13401
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013402/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013404static PyObject *
13405formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013407 char *p;
13408 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013410
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411 x = PyFloat_AsDouble(v);
13412 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013413 return NULL;
13414
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013417
Eric Smith0923d1d2009-04-16 20:16:10 +000013418 p = PyOS_double_to_string(x, type, prec,
13419 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013420 if (p == NULL)
13421 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013422 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013423 PyMem_Free(p);
13424 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425}
13426
Tim Peters38fd5b62000-09-21 05:43:11 +000013427static PyObject*
13428formatlong(PyObject *val, int flags, int prec, int type)
13429{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013430 char *buf;
13431 int len;
13432 PyObject *str; /* temporary string object. */
13433 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013434
Benjamin Peterson14339b62009-01-31 16:36:08 +000013435 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13436 if (!str)
13437 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013438 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013439 Py_DECREF(str);
13440 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013441}
13442
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443static Py_UCS4
13444formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013446 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013447 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013449 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 goto onError;
13452 }
13453 else {
13454 /* Integer input truncated to a character */
13455 long x;
13456 x = PyLong_AsLong(v);
13457 if (x == -1 && PyErr_Occurred())
13458 goto onError;
13459
Victor Stinner8faf8212011-12-08 22:14:11 +010013460 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 PyErr_SetString(PyExc_OverflowError,
13462 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013463 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 }
13465
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013466 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013467 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013468
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013470 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013472 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013473}
13474
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013475static int
13476repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13477{
13478 int r;
13479 assert(count > 0);
13480 assert(PyUnicode_Check(obj));
13481 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013482 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013483 if (repeated == NULL)
13484 return -1;
13485 r = _PyAccu_Accumulate(acc, repeated);
13486 Py_DECREF(repeated);
13487 return r;
13488 }
13489 else {
13490 do {
13491 if (_PyAccu_Accumulate(acc, obj))
13492 return -1;
13493 } while (--count);
13494 return 0;
13495 }
13496}
13497
Alexander Belopolsky40018472011-02-26 01:02:56 +000013498PyObject *
13499PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013501 void *fmt;
13502 int fmtkind;
13503 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013505 int r;
13506 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013509 PyObject *temp = NULL;
13510 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013511 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013512 _PyAccu acc;
13513 static PyObject *plus, *minus, *blank, *zero, *percent;
13514
13515 if (!plus && !(plus = get_latin1_char('+')))
13516 return NULL;
13517 if (!minus && !(minus = get_latin1_char('-')))
13518 return NULL;
13519 if (!blank && !(blank = get_latin1_char(' ')))
13520 return NULL;
13521 if (!zero && !(zero = get_latin1_char('0')))
13522 return NULL;
13523 if (!percent && !(percent = get_latin1_char('%')))
13524 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013525
Guido van Rossumd57fd912000-03-10 22:53:23 +000013526 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 PyErr_BadInternalCall();
13528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013529 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013530 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013531 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013533 if (PyUnicode_READY(uformat) == -1)
13534 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013535 if (_PyAccu_Init(&acc))
13536 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013537 fmt = PyUnicode_DATA(uformat);
13538 fmtkind = PyUnicode_KIND(uformat);
13539 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13540 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541
Guido van Rossumd57fd912000-03-10 22:53:23 +000013542 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 arglen = PyTuple_Size(args);
13544 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013545 }
13546 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 arglen = -1;
13548 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013549 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013550 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013551 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553
13554 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013556 PyObject *nonfmt;
13557 Py_ssize_t nonfmtpos;
13558 nonfmtpos = fmtpos++;
13559 while (fmtcnt >= 0 &&
13560 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13561 fmtpos++;
13562 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013563 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013564 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013565 if (nonfmt == NULL)
13566 goto onError;
13567 r = _PyAccu_Accumulate(&acc, nonfmt);
13568 Py_DECREF(nonfmt);
13569 if (r)
13570 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013571 }
13572 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 /* Got a format specifier */
13574 int flags = 0;
13575 Py_ssize_t width = -1;
13576 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013577 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 int isnumok;
13580 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 void *pbuf = NULL;
13582 Py_ssize_t pindex, len;
13583 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013585 fmtpos++;
13586 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13587 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 Py_ssize_t keylen;
13589 PyObject *key;
13590 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013591
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 if (dict == NULL) {
13593 PyErr_SetString(PyExc_TypeError,
13594 "format requires a mapping");
13595 goto onError;
13596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013599 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 /* Skip over balanced parentheses */
13601 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if (fmtcnt < 0 || pcount > 0) {
13610 PyErr_SetString(PyExc_ValueError,
13611 "incomplete format key");
13612 goto onError;
13613 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013614 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013615 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 if (key == NULL)
13617 goto onError;
13618 if (args_owned) {
13619 Py_DECREF(args);
13620 args_owned = 0;
13621 }
13622 args = PyObject_GetItem(dict, key);
13623 Py_DECREF(key);
13624 if (args == NULL) {
13625 goto onError;
13626 }
13627 args_owned = 1;
13628 arglen = -1;
13629 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013630 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013631 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013632 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 case '-': flags |= F_LJUST; continue;
13634 case '+': flags |= F_SIGN; continue;
13635 case ' ': flags |= F_BLANK; continue;
13636 case '#': flags |= F_ALT; continue;
13637 case '0': flags |= F_ZERO; continue;
13638 }
13639 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013640 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 if (c == '*') {
13642 v = getnextarg(args, arglen, &argidx);
13643 if (v == NULL)
13644 goto onError;
13645 if (!PyLong_Check(v)) {
13646 PyErr_SetString(PyExc_TypeError,
13647 "* wants int");
13648 goto onError;
13649 }
13650 width = PyLong_AsLong(v);
13651 if (width == -1 && PyErr_Occurred())
13652 goto onError;
13653 if (width < 0) {
13654 flags |= F_LJUST;
13655 width = -width;
13656 }
13657 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 }
13660 else if (c >= '0' && c <= '9') {
13661 width = c - '0';
13662 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013663 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 if (c < '0' || c > '9')
13665 break;
13666 if ((width*10) / 10 != width) {
13667 PyErr_SetString(PyExc_ValueError,
13668 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013669 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 }
13671 width = width*10 + (c - '0');
13672 }
13673 }
13674 if (c == '.') {
13675 prec = 0;
13676 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013677 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 if (c == '*') {
13679 v = getnextarg(args, arglen, &argidx);
13680 if (v == NULL)
13681 goto onError;
13682 if (!PyLong_Check(v)) {
13683 PyErr_SetString(PyExc_TypeError,
13684 "* wants int");
13685 goto onError;
13686 }
13687 prec = PyLong_AsLong(v);
13688 if (prec == -1 && PyErr_Occurred())
13689 goto onError;
13690 if (prec < 0)
13691 prec = 0;
13692 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 }
13695 else if (c >= '0' && c <= '9') {
13696 prec = c - '0';
13697 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013698 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013699 if (c < '0' || c > '9')
13700 break;
13701 if ((prec*10) / 10 != prec) {
13702 PyErr_SetString(PyExc_ValueError,
13703 "prec too big");
13704 goto onError;
13705 }
13706 prec = prec*10 + (c - '0');
13707 }
13708 }
13709 } /* prec */
13710 if (fmtcnt >= 0) {
13711 if (c == 'h' || c == 'l' || c == 'L') {
13712 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 }
13715 }
13716 if (fmtcnt < 0) {
13717 PyErr_SetString(PyExc_ValueError,
13718 "incomplete format");
13719 goto onError;
13720 }
13721 if (c != '%') {
13722 v = getnextarg(args, arglen, &argidx);
13723 if (v == NULL)
13724 goto onError;
13725 }
13726 sign = 0;
13727 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013728 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 switch (c) {
13730
13731 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013732 _PyAccu_Accumulate(&acc, percent);
13733 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013734
13735 case 's':
13736 case 'r':
13737 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013738 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 temp = v;
13740 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 }
13742 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 if (c == 's')
13744 temp = PyObject_Str(v);
13745 else if (c == 'r')
13746 temp = PyObject_Repr(v);
13747 else
13748 temp = PyObject_ASCII(v);
13749 if (temp == NULL)
13750 goto onError;
13751 if (PyUnicode_Check(temp))
13752 /* nothing to do */;
13753 else {
13754 Py_DECREF(temp);
13755 PyErr_SetString(PyExc_TypeError,
13756 "%s argument has non-string str()");
13757 goto onError;
13758 }
13759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013760 if (PyUnicode_READY(temp) == -1) {
13761 Py_CLEAR(temp);
13762 goto onError;
13763 }
13764 pbuf = PyUnicode_DATA(temp);
13765 kind = PyUnicode_KIND(temp);
13766 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013767 if (prec >= 0 && len > prec)
13768 len = prec;
13769 break;
13770
13771 case 'i':
13772 case 'd':
13773 case 'u':
13774 case 'o':
13775 case 'x':
13776 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 isnumok = 0;
13778 if (PyNumber_Check(v)) {
13779 PyObject *iobj=NULL;
13780
13781 if (PyLong_Check(v)) {
13782 iobj = v;
13783 Py_INCREF(iobj);
13784 }
13785 else {
13786 iobj = PyNumber_Long(v);
13787 }
13788 if (iobj!=NULL) {
13789 if (PyLong_Check(iobj)) {
13790 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013791 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 Py_DECREF(iobj);
13793 if (!temp)
13794 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013795 if (PyUnicode_READY(temp) == -1) {
13796 Py_CLEAR(temp);
13797 goto onError;
13798 }
13799 pbuf = PyUnicode_DATA(temp);
13800 kind = PyUnicode_KIND(temp);
13801 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013802 sign = 1;
13803 }
13804 else {
13805 Py_DECREF(iobj);
13806 }
13807 }
13808 }
13809 if (!isnumok) {
13810 PyErr_Format(PyExc_TypeError,
13811 "%%%c format: a number is required, "
13812 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13813 goto onError;
13814 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013815 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013816 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013817 fillobj = zero;
13818 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 break;
13820
13821 case 'e':
13822 case 'E':
13823 case 'f':
13824 case 'F':
13825 case 'g':
13826 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013827 temp = formatfloat(v, flags, prec, c);
13828 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013830 if (PyUnicode_READY(temp) == -1) {
13831 Py_CLEAR(temp);
13832 goto onError;
13833 }
13834 pbuf = PyUnicode_DATA(temp);
13835 kind = PyUnicode_KIND(temp);
13836 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013837 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013838 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013840 fillobj = zero;
13841 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 break;
13843
13844 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013845 {
13846 Py_UCS4 ch = formatchar(v);
13847 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013849 temp = _PyUnicode_FromUCS4(&ch, 1);
13850 if (temp == NULL)
13851 goto onError;
13852 pbuf = PyUnicode_DATA(temp);
13853 kind = PyUnicode_KIND(temp);
13854 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013856 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013857
13858 default:
13859 PyErr_Format(PyExc_ValueError,
13860 "unsupported format character '%c' (0x%x) "
13861 "at index %zd",
13862 (31<=c && c<=126) ? (char)c : '?',
13863 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013865 goto onError;
13866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013867 /* pbuf is initialized here. */
13868 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013870 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13871 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013873 pindex++;
13874 }
13875 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13876 signobj = plus;
13877 len--;
13878 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013879 }
13880 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013881 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013883 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 else
13885 sign = 0;
13886 }
13887 if (width < len)
13888 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013889 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013890 if (fill != ' ') {
13891 assert(signobj != NULL);
13892 if (_PyAccu_Accumulate(&acc, signobj))
13893 goto onError;
13894 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 if (width > len)
13896 width--;
13897 }
13898 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013899 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013900 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013901 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013902 second = get_latin1_char(
13903 PyUnicode_READ(kind, pbuf, pindex + 1));
13904 pindex += 2;
13905 if (second == NULL ||
13906 _PyAccu_Accumulate(&acc, zero) ||
13907 _PyAccu_Accumulate(&acc, second))
13908 goto onError;
13909 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013910 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013911 width -= 2;
13912 if (width < 0)
13913 width = 0;
13914 len -= 2;
13915 }
13916 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013917 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013918 if (repeat_accumulate(&acc, fillobj, width - len))
13919 goto onError;
13920 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 }
13922 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013923 if (sign) {
13924 assert(signobj != NULL);
13925 if (_PyAccu_Accumulate(&acc, signobj))
13926 goto onError;
13927 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13930 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013931 second = get_latin1_char(
13932 PyUnicode_READ(kind, pbuf, pindex + 1));
13933 pindex += 2;
13934 if (second == NULL ||
13935 _PyAccu_Accumulate(&acc, zero) ||
13936 _PyAccu_Accumulate(&acc, second))
13937 goto onError;
13938 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013939 }
13940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013941 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013942 if (temp != NULL) {
13943 assert(pbuf == PyUnicode_DATA(temp));
13944 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013946 else {
13947 const char *p = (const char *) pbuf;
13948 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013949 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 v = PyUnicode_FromKindAndData(kind, p, len);
13951 }
13952 if (v == NULL)
13953 goto onError;
13954 r = _PyAccu_Accumulate(&acc, v);
13955 Py_DECREF(v);
13956 if (r)
13957 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013958 if (width > len && repeat_accumulate(&acc, blank, width - len))
13959 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013960 if (dict && (argidx < arglen) && c != '%') {
13961 PyErr_SetString(PyExc_TypeError,
13962 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013963 goto onError;
13964 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013965 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013966 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013967 } /* until end */
13968 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013969 PyErr_SetString(PyExc_TypeError,
13970 "not all arguments converted during string formatting");
13971 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013972 }
13973
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013974 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013975 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013976 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013977 }
13978 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013979 Py_XDECREF(temp);
13980 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013981 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013982
Benjamin Peterson29060642009-01-31 22:14:21 +000013983 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013984 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013985 Py_XDECREF(temp);
13986 Py_XDECREF(second);
13987 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990 }
13991 return NULL;
13992}
13993
Jeremy Hylton938ace62002-07-17 16:30:39 +000013994static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013995unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13996
Tim Peters6d6c1a32001-08-02 04:15:00 +000013997static PyObject *
13998unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13999{
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 static char *kwlist[] = {"object", "encoding", "errors", 0};
14002 char *encoding = NULL;
14003 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014004
Benjamin Peterson14339b62009-01-31 16:36:08 +000014005 if (type != &PyUnicode_Type)
14006 return unicode_subtype_new(type, args, kwds);
14007 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014008 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014010 if (x == NULL) {
14011 Py_INCREF(unicode_empty);
14012 return unicode_empty;
14013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 if (encoding == NULL && errors == NULL)
14015 return PyObject_Str(x);
14016 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014018}
14019
Guido van Rossume023fe02001-08-30 03:12:59 +000014020static PyObject *
14021unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14022{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014023 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014024 Py_ssize_t length, char_size;
14025 int share_wstr, share_utf8;
14026 unsigned int kind;
14027 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014028
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014030
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014031 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014032 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014034 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014035 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014036 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014037 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014038 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014039
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014040 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014041 if (self == NULL) {
14042 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014043 return NULL;
14044 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014045 kind = PyUnicode_KIND(unicode);
14046 length = PyUnicode_GET_LENGTH(unicode);
14047
14048 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014049#ifdef Py_DEBUG
14050 _PyUnicode_HASH(self) = -1;
14051#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014053#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014054 _PyUnicode_STATE(self).interned = 0;
14055 _PyUnicode_STATE(self).kind = kind;
14056 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014057 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014058 _PyUnicode_STATE(self).ready = 1;
14059 _PyUnicode_WSTR(self) = NULL;
14060 _PyUnicode_UTF8_LENGTH(self) = 0;
14061 _PyUnicode_UTF8(self) = NULL;
14062 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014063 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014064
14065 share_utf8 = 0;
14066 share_wstr = 0;
14067 if (kind == PyUnicode_1BYTE_KIND) {
14068 char_size = 1;
14069 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14070 share_utf8 = 1;
14071 }
14072 else if (kind == PyUnicode_2BYTE_KIND) {
14073 char_size = 2;
14074 if (sizeof(wchar_t) == 2)
14075 share_wstr = 1;
14076 }
14077 else {
14078 assert(kind == PyUnicode_4BYTE_KIND);
14079 char_size = 4;
14080 if (sizeof(wchar_t) == 4)
14081 share_wstr = 1;
14082 }
14083
14084 /* Ensure we won't overflow the length. */
14085 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14086 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014087 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014088 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014089 data = PyObject_MALLOC((length + 1) * char_size);
14090 if (data == NULL) {
14091 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014092 goto onError;
14093 }
14094
Victor Stinnerc3c74152011-10-02 20:39:55 +020014095 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014096 if (share_utf8) {
14097 _PyUnicode_UTF8_LENGTH(self) = length;
14098 _PyUnicode_UTF8(self) = data;
14099 }
14100 if (share_wstr) {
14101 _PyUnicode_WSTR_LENGTH(self) = length;
14102 _PyUnicode_WSTR(self) = (wchar_t *)data;
14103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014104
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014105 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014106 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014107 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014108#ifdef Py_DEBUG
14109 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14110#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014111 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014112 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014113
14114onError:
14115 Py_DECREF(unicode);
14116 Py_DECREF(self);
14117 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014118}
14119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014120PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014121 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014122\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014123Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014124encoding defaults to the current default string encoding.\n\
14125errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014126
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014127static PyObject *unicode_iter(PyObject *seq);
14128
Guido van Rossumd57fd912000-03-10 22:53:23 +000014129PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014130 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014131 "str", /* tp_name */
14132 sizeof(PyUnicodeObject), /* tp_size */
14133 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014134 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014135 (destructor)unicode_dealloc, /* tp_dealloc */
14136 0, /* tp_print */
14137 0, /* tp_getattr */
14138 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014139 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014140 unicode_repr, /* tp_repr */
14141 &unicode_as_number, /* tp_as_number */
14142 &unicode_as_sequence, /* tp_as_sequence */
14143 &unicode_as_mapping, /* tp_as_mapping */
14144 (hashfunc) unicode_hash, /* tp_hash*/
14145 0, /* tp_call*/
14146 (reprfunc) unicode_str, /* tp_str */
14147 PyObject_GenericGetAttr, /* tp_getattro */
14148 0, /* tp_setattro */
14149 0, /* tp_as_buffer */
14150 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014151 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 unicode_doc, /* tp_doc */
14153 0, /* tp_traverse */
14154 0, /* tp_clear */
14155 PyUnicode_RichCompare, /* tp_richcompare */
14156 0, /* tp_weaklistoffset */
14157 unicode_iter, /* tp_iter */
14158 0, /* tp_iternext */
14159 unicode_methods, /* tp_methods */
14160 0, /* tp_members */
14161 0, /* tp_getset */
14162 &PyBaseObject_Type, /* tp_base */
14163 0, /* tp_dict */
14164 0, /* tp_descr_get */
14165 0, /* tp_descr_set */
14166 0, /* tp_dictoffset */
14167 0, /* tp_init */
14168 0, /* tp_alloc */
14169 unicode_new, /* tp_new */
14170 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014171};
14172
14173/* Initialize the Unicode implementation */
14174
Victor Stinner3a50e702011-10-18 21:21:00 +020014175int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014176{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014177 int i;
14178
Thomas Wouters477c8d52006-05-27 19:21:47 +000014179 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014180 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014181 0x000A, /* LINE FEED */
14182 0x000D, /* CARRIAGE RETURN */
14183 0x001C, /* FILE SEPARATOR */
14184 0x001D, /* GROUP SEPARATOR */
14185 0x001E, /* RECORD SEPARATOR */
14186 0x0085, /* NEXT LINE */
14187 0x2028, /* LINE SEPARATOR */
14188 0x2029, /* PARAGRAPH SEPARATOR */
14189 };
14190
Fred Drakee4315f52000-05-09 19:53:39 +000014191 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014192 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014193 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014194 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014197 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014198 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014199 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014200 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014201
14202 /* initialize the linebreak bloom filter */
14203 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014204 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014205 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014206
14207 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014208
14209#ifdef HAVE_MBCS
14210 winver.dwOSVersionInfoSize = sizeof(winver);
14211 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14212 PyErr_SetFromWindowsErr(0);
14213 return -1;
14214 }
14215#endif
14216 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014217}
14218
14219/* Finalize the Unicode implementation */
14220
/* Does no work in this implementation and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14226
Guido van Rossumd57fd912000-03-10 22:53:23 +000014227void
Thomas Wouters78890102000-07-22 19:25:51 +000014228_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014229{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014230 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014231
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014232 Py_XDECREF(unicode_empty);
14233 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014234
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014235 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014236 if (unicode_latin1[i]) {
14237 Py_DECREF(unicode_latin1[i]);
14238 unicode_latin1[i] = NULL;
14239 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014240 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014241 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014242 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014243}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014244
Walter Dörwald16807132007-05-25 13:52:07 +000014245void
14246PyUnicode_InternInPlace(PyObject **p)
14247{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014248 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014250#ifdef Py_DEBUG
14251 assert(s != NULL);
14252 assert(_PyUnicode_CHECK(s));
14253#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014254 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014255 return;
14256#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014257 /* If it's a subclass, we don't really know what putting
14258 it in the interned dict might do. */
14259 if (!PyUnicode_CheckExact(s))
14260 return;
14261 if (PyUnicode_CHECK_INTERNED(s))
14262 return;
14263 if (interned == NULL) {
14264 interned = PyDict_New();
14265 if (interned == NULL) {
14266 PyErr_Clear(); /* Don't leave an exception */
14267 return;
14268 }
14269 }
14270 /* It might be that the GetItem call fails even
14271 though the key is present in the dictionary,
14272 namely when this happens during a stack overflow. */
14273 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014274 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014276
Benjamin Peterson29060642009-01-31 22:14:21 +000014277 if (t) {
14278 Py_INCREF(t);
14279 Py_DECREF(*p);
14280 *p = t;
14281 return;
14282 }
Walter Dörwald16807132007-05-25 13:52:07 +000014283
Benjamin Peterson14339b62009-01-31 16:36:08 +000014284 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014285 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 PyErr_Clear();
14287 PyThreadState_GET()->recursion_critical = 0;
14288 return;
14289 }
14290 PyThreadState_GET()->recursion_critical = 0;
14291 /* The two references in interned are not counted by refcnt.
14292 The deallocator will take care of this */
14293 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014294 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014295}
14296
14297void
14298PyUnicode_InternImmortal(PyObject **p)
14299{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 PyUnicode_InternInPlace(p);
14301 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014302 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 Py_INCREF(*p);
14304 }
Walter Dörwald16807132007-05-25 13:52:07 +000014305}
14306
14307PyObject *
14308PyUnicode_InternFromString(const char *cp)
14309{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014310 PyObject *s = PyUnicode_FromString(cp);
14311 if (s == NULL)
14312 return NULL;
14313 PyUnicode_InternInPlace(&s);
14314 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014315}
14316
Alexander Belopolsky40018472011-02-26 01:02:56 +000014317void
14318_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014321 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014322 Py_ssize_t i, n;
14323 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014324
Benjamin Peterson14339b62009-01-31 16:36:08 +000014325 if (interned == NULL || !PyDict_Check(interned))
14326 return;
14327 keys = PyDict_Keys(interned);
14328 if (keys == NULL || !PyList_Check(keys)) {
14329 PyErr_Clear();
14330 return;
14331 }
Walter Dörwald16807132007-05-25 13:52:07 +000014332
Benjamin Peterson14339b62009-01-31 16:36:08 +000014333 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14334 detector, interned unicode strings are not forcibly deallocated;
14335 rather, we give them their stolen references back, and then clear
14336 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014337
Benjamin Peterson14339b62009-01-31 16:36:08 +000014338 n = PyList_GET_SIZE(keys);
14339 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014340 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014342 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014343 if (PyUnicode_READY(s) == -1) {
14344 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014345 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014347 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014348 case SSTATE_NOT_INTERNED:
14349 /* XXX Shouldn't happen */
14350 break;
14351 case SSTATE_INTERNED_IMMORTAL:
14352 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014353 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014354 break;
14355 case SSTATE_INTERNED_MORTAL:
14356 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014357 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014358 break;
14359 default:
14360 Py_FatalError("Inconsistent interned string state.");
14361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014362 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014363 }
14364 fprintf(stderr, "total size of all interned strings: "
14365 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14366 "mortal/immortal\n", mortal_size, immortal_size);
14367 Py_DECREF(keys);
14368 PyDict_Clear(interned);
14369 Py_DECREF(interned);
14370 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014371}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014372
14373
14374/********************* Unicode Iterator **************************/
14375
14376typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 PyObject_HEAD
14378 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014379 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014380} unicodeiterobject;
14381
14382static void
14383unicodeiter_dealloc(unicodeiterobject *it)
14384{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014385 _PyObject_GC_UNTRACK(it);
14386 Py_XDECREF(it->it_seq);
14387 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014388}
14389
14390static int
14391unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14392{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014393 Py_VISIT(it->it_seq);
14394 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014395}
14396
14397static PyObject *
14398unicodeiter_next(unicodeiterobject *it)
14399{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014400 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014401
Benjamin Peterson14339b62009-01-31 16:36:08 +000014402 assert(it != NULL);
14403 seq = it->it_seq;
14404 if (seq == NULL)
14405 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014406 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014408 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14409 int kind = PyUnicode_KIND(seq);
14410 void *data = PyUnicode_DATA(seq);
14411 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14412 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 if (item != NULL)
14414 ++it->it_index;
14415 return item;
14416 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014417
Benjamin Peterson14339b62009-01-31 16:36:08 +000014418 Py_DECREF(seq);
14419 it->it_seq = NULL;
14420 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014421}
14422
14423static PyObject *
14424unicodeiter_len(unicodeiterobject *it)
14425{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014426 Py_ssize_t len = 0;
14427 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014428 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014430}
14431
14432PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14433
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014434static PyObject *
14435unicodeiter_reduce(unicodeiterobject *it)
14436{
14437 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014438 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014439 it->it_seq, it->it_index);
14440 } else {
14441 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14442 if (u == NULL)
14443 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014444 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014445 }
14446}
14447
14448PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14449
14450static PyObject *
14451unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14452{
14453 Py_ssize_t index = PyLong_AsSsize_t(state);
14454 if (index == -1 && PyErr_Occurred())
14455 return NULL;
14456 if (index < 0)
14457 index = 0;
14458 it->it_index = index;
14459 Py_RETURN_NONE;
14460}
14461
14462PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14463
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014464static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014465 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014466 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014467 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14468 reduce_doc},
14469 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14470 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014471 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014472};
14473
14474PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14476 "str_iterator", /* tp_name */
14477 sizeof(unicodeiterobject), /* tp_basicsize */
14478 0, /* tp_itemsize */
14479 /* methods */
14480 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14481 0, /* tp_print */
14482 0, /* tp_getattr */
14483 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014484 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 0, /* tp_repr */
14486 0, /* tp_as_number */
14487 0, /* tp_as_sequence */
14488 0, /* tp_as_mapping */
14489 0, /* tp_hash */
14490 0, /* tp_call */
14491 0, /* tp_str */
14492 PyObject_GenericGetAttr, /* tp_getattro */
14493 0, /* tp_setattro */
14494 0, /* tp_as_buffer */
14495 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14496 0, /* tp_doc */
14497 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14498 0, /* tp_clear */
14499 0, /* tp_richcompare */
14500 0, /* tp_weaklistoffset */
14501 PyObject_SelfIter, /* tp_iter */
14502 (iternextfunc)unicodeiter_next, /* tp_iternext */
14503 unicodeiter_methods, /* tp_methods */
14504 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014505};
14506
14507static PyObject *
14508unicode_iter(PyObject *seq)
14509{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014510 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014511
Benjamin Peterson14339b62009-01-31 16:36:08 +000014512 if (!PyUnicode_Check(seq)) {
14513 PyErr_BadInternalCall();
14514 return NULL;
14515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014516 if (PyUnicode_READY(seq) == -1)
14517 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014518 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14519 if (it == NULL)
14520 return NULL;
14521 it->it_index = 0;
14522 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014523 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014524 _PyObject_GC_TRACK(it);
14525 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014526}
14527
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014528
14529size_t
14530Py_UNICODE_strlen(const Py_UNICODE *u)
14531{
14532 int res = 0;
14533 while(*u++)
14534 res++;
14535 return res;
14536}
14537
14538Py_UNICODE*
14539Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14540{
14541 Py_UNICODE *u = s1;
14542 while ((*u++ = *s2++));
14543 return s1;
14544}
14545
14546Py_UNICODE*
14547Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14548{
14549 Py_UNICODE *u = s1;
14550 while ((*u++ = *s2++))
14551 if (n-- == 0)
14552 break;
14553 return s1;
14554}
14555
14556Py_UNICODE*
14557Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14558{
14559 Py_UNICODE *u1 = s1;
14560 u1 += Py_UNICODE_strlen(u1);
14561 Py_UNICODE_strcpy(u1, s2);
14562 return s1;
14563}
14564
14565int
14566Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14567{
14568 while (*s1 && *s2 && *s1 == *s2)
14569 s1++, s2++;
14570 if (*s1 && *s2)
14571 return (*s1 < *s2) ? -1 : +1;
14572 if (*s1)
14573 return 1;
14574 if (*s2)
14575 return -1;
14576 return 0;
14577}
14578
14579int
14580Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14581{
14582 register Py_UNICODE u1, u2;
14583 for (; n != 0; n--) {
14584 u1 = *s1;
14585 u2 = *s2;
14586 if (u1 != u2)
14587 return (u1 < u2) ? -1 : +1;
14588 if (u1 == '\0')
14589 return 0;
14590 s1++;
14591 s2++;
14592 }
14593 return 0;
14594}
14595
14596Py_UNICODE*
14597Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14598{
14599 const Py_UNICODE *p;
14600 for (p = s; *p; p++)
14601 if (*p == c)
14602 return (Py_UNICODE*)p;
14603 return NULL;
14604}
14605
14606Py_UNICODE*
14607Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14608{
14609 const Py_UNICODE *p;
14610 p = s + Py_UNICODE_strlen(s);
14611 while (p != s) {
14612 p--;
14613 if (*p == c)
14614 return (Py_UNICODE*)p;
14615 }
14616 return NULL;
14617}
Victor Stinner331ea922010-08-10 16:37:20 +000014618
Victor Stinner71133ff2010-09-01 23:43:53 +000014619Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014620PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014621{
Victor Stinner577db2c2011-10-11 22:12:48 +020014622 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014623 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014625 if (!PyUnicode_Check(unicode)) {
14626 PyErr_BadArgument();
14627 return NULL;
14628 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014629 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014630 if (u == NULL)
14631 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014632 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014633 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014634 PyErr_NoMemory();
14635 return NULL;
14636 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014637 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014638 size *= sizeof(Py_UNICODE);
14639 copy = PyMem_Malloc(size);
14640 if (copy == NULL) {
14641 PyErr_NoMemory();
14642 return NULL;
14643 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014644 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014645 return copy;
14646}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014647
Georg Brandl66c221e2010-10-14 07:04:07 +000014648/* A _string module, to export formatter_parser and formatter_field_name_split
14649 to the string.Formatter class implemented in Python. */
14650
14651static PyMethodDef _string_methods[] = {
14652 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14653 METH_O, PyDoc_STR("split the argument as a field name")},
14654 {"formatter_parser", (PyCFunction) formatter_parser,
14655 METH_O, PyDoc_STR("parse the argument as a format string")},
14656 {NULL, NULL}
14657};
14658
14659static struct PyModuleDef _string_module = {
14660 PyModuleDef_HEAD_INIT,
14661 "_string",
14662 PyDoc_STR("string helper module"),
14663 0,
14664 _string_methods,
14665 NULL,
14666 NULL,
14667 NULL,
14668 NULL
14669};
14670
14671PyMODINIT_FUNC
14672PyInit__string(void)
14673{
14674 return PyModule_Create(&_string_module);
14675}
14676
14677
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014678#ifdef __cplusplus
14679}
14680#endif