blob: 95ecfe2ae090f49990149ba057135d1df442bbf6 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise (the default). */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10FFFF
72
/* _PyUnicode_CHECK(op): in debug builds run the structural consistency
   check (content is not scanned: check_content is 0); in other builds
   fall back to the plain type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors for the string object layouts.  The forms that contain
   no assert are bare casts and are also used as assignment targets in
   this file; the remaining forms validate the object first. */

/* utf8 pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 bytes of a ready string: for a compact ASCII object this is the
   memory directly following the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_COMPACT_ASCII(op)                            \
          ? ((char *)((PyASCIIObject *)(op) + 1))               \
          : _PyUnicode_UTF8(op)))

/* utf8_length field stored in the compact header. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: the character length for a compact
   ASCII object, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     (PyUnicode_IS_COMPACT_ASCII(op)                            \
          ? ((PyASCIIObject *)(op))->length                     \
          : _PyUnicode_UTF8_LENGTH(op)))

/* wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject *)(op))->wstr_length)

/* Unchecked access to the length, state and hash header fields. */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)  (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)   (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Replace any earlier definition of PyUnicode_READY: evaluates to 0 when
   the object is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True when the utf8 pointer is the canonical data buffer itself.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer is the canonical data buffer itself.
   Every use refers to the macro parameter; no identifier is picked up
   from the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* Nonzero when the Unicode object owns a UTF-8 memory block of its own,
   i.e. one that is not shared with other data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                        \
      _PyUnicode_UTF8(op) &&                                    \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
136
/* Nonzero when the Unicode object owns a wstr memory block of its own,
   i.e. one that is not shared with other data.  A non-ready object with
   a wstr pointer always counts as owning it. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL &&                            \
      !(PyUnicode_IS_READY(op) &&                               \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters between element types.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters; to
   points to the destination buffer, which receives (end - begin)
   elements of type to_type.

   The `to` argument is parenthesized before the cast, so an expression
   such as `buf + offset` is evaluated in its own pointer type first and
   only the result is converted to "to_type *".  The main loop copies
   four elements per iteration; the trailing loop handles the remaining
   0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~(Py_ssize_t)3);              \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: one entry
   per value 0..127, 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of line breaks in the ASCII range: one entry per value
   0..127, 1 meaning "line break".  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal state.

   op            -- object to inspect; must pass PyUnicode_Check().
   check_content -- when nonzero (and the string is not of the wchar
                    kind), also scan the characters and verify that the
                    narrowest suitable kind is in use.

   Every violation is reported through assert(); the function itself
   always returns 1, so it can be wrapped in an assert() by callers. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    Py_ssize_t i;
    Py_UCS4 maxchar;
    void *chars;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII: only the short header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose element size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            /* non-compact object: characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert (compact->utf8 != data);
                }
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* nothing more to do unless the content check applies */
    if (!check_content || kind == PyUnicode_WCHAR_KIND)
        return 1;

    /* check that the best kind is used */
    maxchar = 0;
    chars = PyUnicode_DATA(ascii);
    for (i = 0; i < ascii->length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, chars, i);
        if (maxchar < ch)
            maxchar = ch;
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (ascii->state.ascii != 0) {
            assert(maxchar < 128);
        }
        else {
            assert(maxchar >= 128);
            assert(maxchar <= 255);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        assert(maxchar >= 0x100);
        assert(maxchar <= 0xFFFF);
        break;
    default:
        assert(maxchar >= 0x10000);
        assert(maxchar <= MAX_UNICODE);
        break;
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
/* OS version record, only declared when HAVE_MBCS is defined. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
510/* stuff to implement simple "bloom filters" for Unicode characters.
511 to keep things simple, we use a single bitmask, using the least 5
512 bits from each unicode characters as the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch):     nonzero when the bit selected by ch is set, i.e.
                        ch *may* belong to the set (false positives are
                        possible, false negatives are not).
   Both arguments are parenthesized so that arbitrary expressions can be
   passed without precedence surprises. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128; above that, the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK only on a possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* Membership test of chr in str: the bloom mask is checked first, and
   PyUnicode_FindChar only runs when the mask reports a possible hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (0 <= PyUnicode_FindChar(str, chr, 0,                           \
                                 PyUnicode_GET_LENGTH(str), 1)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
800
#ifdef Py_DEBUG
/* Debug-build statistic: number of objects allocated by _PyUnicode_New(). */
static int unicode_old_new_calls = 0;
#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
#ifdef Py_DEBUG
/* Debug-build statistic: number of objects allocated by PyUnicode_New(). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact/ascii flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the string's internal layout to stdout:
   kind name, length, wstr and utf8 pointers (flagged "shared" when they
   alias the data buffer) and the data pointer.

   The struct fields are read directly instead of going through the checking
   macros.  Lengths are signed (Py_ssize_t) and are therefore printed with
   %zd; pointers are converted to void* as %p requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Compact strings store their characters right after the struct. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The wstr_length/utf8 fields are skipped for compact ASCII strings. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001001 if (maxchar > MAX_UNICODE) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "invalid maximum character passed to PyUnicode_New");
1004 return NULL;
1005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 kind_state = PyUnicode_4BYTE_KIND;
1007 char_size = 4;
1008 if (sizeof(wchar_t) == 4)
1009 is_sharing = 1;
1010 }
1011
1012 /* Ensure we won't overflow the size. */
1013 if (size < 0) {
1014 PyErr_SetString(PyExc_SystemError,
1015 "Negative size passed to PyUnicode_New");
1016 return NULL;
1017 }
1018 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1019 return PyErr_NoMemory();
1020
1021 /* Duplicated allocation code from _PyObject_New() instead of a call to
1022 * PyObject_New() so we are able to allocate space for the object and
1023 * it's data buffer.
1024 */
1025 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1026 if (obj == NULL)
1027 return PyErr_NoMemory();
1028 obj = PyObject_INIT(obj, &PyUnicode_Type);
1029 if (obj == NULL)
1030 return NULL;
1031
1032 unicode = (PyCompactUnicodeObject *)obj;
1033 if (is_ascii)
1034 data = ((PyASCIIObject*)obj) + 1;
1035 else
1036 data = unicode + 1;
1037 _PyUnicode_LENGTH(unicode) = size;
1038 _PyUnicode_HASH(unicode) = -1;
1039 _PyUnicode_STATE(unicode).interned = 0;
1040 _PyUnicode_STATE(unicode).kind = kind_state;
1041 _PyUnicode_STATE(unicode).compact = 1;
1042 _PyUnicode_STATE(unicode).ready = 1;
1043 _PyUnicode_STATE(unicode).ascii = is_ascii;
1044 if (is_ascii) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 }
1048 else if (kind_state == PyUnicode_1BYTE_KIND) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 }
1055 else {
1056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 if (kind_state == PyUnicode_2BYTE_KIND)
1059 ((Py_UCS2*)data)[size] = 0;
1060 else /* kind_state == PyUnicode_4BYTE_KIND */
1061 ((Py_UCS4*)data)[size] = 0;
1062 if (is_sharing) {
1063 _PyUnicode_WSTR_LENGTH(unicode) = size;
1064 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1065 }
1066 else {
1067 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1068 _PyUnicode_WSTR(unicode) = NULL;
1069 }
1070 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001071 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 return obj;
1073}
1074
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is joined into one
   code point; every other unit (including a lone surrogate) is copied as is.
   The other conversions are implemented as macros for efficiency.

   `unicode` must already be a 4-byte string whose length equals the number
   of code points produced from [begin, end) (asserted).  This function
   assumes that unicode can hold one more code point than wstr characters for
   a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* valid pair: two input units become one output character */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1114
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115static int
Victor Stinner488fa492011-12-12 00:01:39 +01001116unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117{
Victor Stinner488fa492011-12-12 00:01:39 +01001118 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001119 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001120 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return -1;
1122 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001123 return 0;
1124}
1125
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001126static int
1127_copy_characters(PyObject *to, Py_ssize_t to_start,
1128 PyObject *from, Py_ssize_t from_start,
1129 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 unsigned int from_kind, to_kind;
1132 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001133 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
1136 assert(PyUnicode_Check(to));
1137 assert(PyUnicode_IS_READY(from));
1138 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001144 if (how_many == 0)
1145 return 0;
1146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001150 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152#ifdef Py_DEBUG
1153 if (!check_maxchar
1154 && (from_kind > to_kind
1155 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001157 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1158 Py_UCS4 ch;
1159 Py_ssize_t i;
1160 for (i=0; i < how_many; i++) {
1161 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1162 assert(ch <= to_maxchar);
1163 }
1164 }
1165#endif
1166 fast = (from_kind == to_kind);
1167 if (check_maxchar
1168 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1169 {
1170 /* deny latin1 => ascii */
1171 fast = 0;
1172 }
1173
1174 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001175 Py_MEMCPY((char*)to_data + to_kind * to_start,
1176 (char*)from_data + from_kind * from_start,
1177 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
1180 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS2,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_2BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001189 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS1, Py_UCS4,
1194 PyUnicode_1BYTE_DATA(from) + from_start,
1195 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
1199 else if (from_kind == PyUnicode_2BYTE_KIND
1200 && to_kind == PyUnicode_4BYTE_KIND)
1201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS2, Py_UCS4,
1204 PyUnicode_2BYTE_DATA(from) + from_start,
1205 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_4BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 /* check if max_char(from substring) <= max_char(to) */
1211 if (from_kind > to_kind
1212 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001213 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 /* slow path to check for character overflow */
1216 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001217 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 Py_ssize_t i;
1219
Victor Stinner56c161a2011-10-06 02:47:11 +02001220#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001221 for (i=0; i < how_many; i++) {
1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001223 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001224 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1225 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001226#else
1227 if (!check_maxchar) {
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1231 }
1232 }
1233 else {
1234 for (i=0; i < how_many; i++) {
1235 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1236 if (ch > to_maxchar)
1237 return 1;
1238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239 }
1240 }
1241#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001244 assert(0 && "inconsistent state");
1245 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001246 }
1247 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001248 return 0;
1249}
1250
1251static void
1252copy_characters(PyObject *to, Py_ssize_t to_start,
1253 PyObject *from, Py_ssize_t from_start,
1254 Py_ssize_t how_many)
1255{
1256 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1257}
1258
1259Py_ssize_t
1260PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1261 PyObject *from, Py_ssize_t from_start,
1262 Py_ssize_t how_many)
1263{
1264 int err;
1265
1266 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1267 PyErr_BadInternalCall();
1268 return -1;
1269 }
1270
Benjamin Petersonbac79492012-01-14 13:34:47 -05001271 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001273 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001274 return -1;
1275
1276 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1277 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1278 PyErr_Format(PyExc_SystemError,
1279 "Cannot write %zi characters at %zi "
1280 "in a string of %zi characters",
1281 how_many, to_start, PyUnicode_GET_LENGTH(to));
1282 return -1;
1283 }
1284
1285 if (how_many == 0)
1286 return 0;
1287
Victor Stinner488fa492011-12-12 00:01:39 +01001288 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289 return -1;
1290
1291 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1292 if (err) {
1293 PyErr_Format(PyExc_SystemError,
1294 "Cannot copy %s characters "
1295 "into a string of %s characters",
1296 unicode_kind_name(from),
1297 unicode_kind_name(to));
1298 return -1;
1299 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001300 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinner17222162011-09-28 22:15:37 +02001303/* Find the maximum code point and count the number of surrogate pairs so a
1304 correct string length can be computed before converting a string to UCS4.
1305 This function counts single surrogates as a character and not as a pair.
1306
1307 Return 0 on success, or -1 on error. */
1308static int
1309find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1310 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311{
1312 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001313 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314
Victor Stinnerc53be962011-10-02 21:33:54 +02001315 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 *num_surrogates = 0;
1317 *maxchar = 0;
1318
1319 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001321 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1322 && (iter+1) < end
1323 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001325 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 iter += 2;
1328 }
1329 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001331 {
1332 ch = *iter;
1333 iter++;
1334 }
1335 if (ch > *maxchar) {
1336 *maxchar = ch;
1337 if (*maxchar > MAX_UNICODE) {
1338 PyErr_Format(PyExc_ValueError,
1339 "character U+%x is not in range [U+0000; U+10ffff]",
1340 ch);
1341 return -1;
1342 }
1343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 }
1345 return 0;
1346}
1347
#ifdef Py_DEBUG
/* Debug-build statistic: number of calls to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
#endif
1351
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001352int
1353_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 wchar_t *end;
1356 Py_UCS4 maxchar = 0;
1357 Py_ssize_t num_surrogates;
1358#if SIZEOF_WCHAR_T == 2
1359 Py_ssize_t length_wo_surrogates;
1360#endif
1361
Georg Brandl7597add2011-10-05 16:36:47 +02001362 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001363 strings were created using _PyObject_New() and where no canonical
1364 representation (the str field) has been set yet aka strings
1365 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001366 assert(_PyUnicode_CHECK(unicode));
1367 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001370 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001371 /* Actually, it should neither be interned nor be anything else: */
1372 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
1374#ifdef Py_DEBUG
1375 ++unicode_ready_calls;
1376#endif
1377
1378 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001379 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001380 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382
1383 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001384 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1385 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 PyErr_NoMemory();
1387 return -1;
1388 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001389 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 _PyUnicode_WSTR(unicode), end,
1391 PyUnicode_1BYTE_DATA(unicode));
1392 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1393 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1394 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1395 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001397 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001401 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8(unicode) = NULL;
1403 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 }
1405 PyObject_FREE(_PyUnicode_WSTR(unicode));
1406 _PyUnicode_WSTR(unicode) = NULL;
1407 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408 }
1409 /* In this case we might have to convert down from 4-byte native
1410 wchar_t to 2-byte unicode. */
1411 else if (maxchar < 65536) {
1412 assert(num_surrogates == 0 &&
1413 "FindMaxCharAndNumSurrogatePairs() messed up");
1414
Victor Stinner506f5922011-09-28 22:34:18 +02001415#if SIZEOF_WCHAR_T == 2
1416 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1419 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1420 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001423#else
1424 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001425 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001426 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001428 PyErr_NoMemory();
1429 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
Victor Stinner506f5922011-09-28 22:34:18 +02001431 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1432 _PyUnicode_WSTR(unicode), end,
1433 PyUnicode_2BYTE_DATA(unicode));
1434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyObject_FREE(_PyUnicode_WSTR(unicode));
1440 _PyUnicode_WSTR(unicode) = NULL;
1441 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1442#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 }
1444 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1445 else {
1446#if SIZEOF_WCHAR_T == 2
1447 /* in case the native representation is 2-bytes, we need to allocate a
1448 new normalized 4-byte version. */
1449 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1451 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyErr_NoMemory();
1453 return -1;
1454 }
1455 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1456 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001457 _PyUnicode_UTF8(unicode) = NULL;
1458 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001459 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1460 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001461 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 PyObject_FREE(_PyUnicode_WSTR(unicode));
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1465#else
1466 assert(num_surrogates == 0);
1467
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1473#endif
1474 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1475 }
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001477 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return 0;
1479}
1480
Alexander Belopolsky40018472011-02-26 01:02:56 +00001481static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001482unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483{
Walter Dörwald16807132007-05-25 13:52:07 +00001484 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 case SSTATE_NOT_INTERNED:
1486 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001487
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 case SSTATE_INTERNED_MORTAL:
1489 /* revive dead object temporarily for DelItem */
1490 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001491 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 Py_FatalError(
1493 "deletion of interned string failed");
1494 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001495
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_INTERNED_IMMORTAL:
1497 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 default:
1500 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001501 }
1502
Victor Stinner03490912011-10-03 23:45:12 +02001503 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001505 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001506 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001507 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1508 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001509
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001510 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511}
1512
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character strings stored in unicode_latin1[], and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty) {
        return 1;
    }

    header = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not in the wchar_t
       representation can be a cached latin1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1529
Alexander Belopolsky40018472011-02-26 01:02:56 +00001530static int
Victor Stinner488fa492011-12-12 00:01:39 +01001531unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001532{
Victor Stinner488fa492011-12-12 00:01:39 +01001533 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 if (Py_REFCNT(unicode) != 1)
1535 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001536 if (_PyUnicode_HASH(unicode) != -1)
1537 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001538 if (PyUnicode_CHECK_INTERNED(unicode))
1539 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001540 if (!PyUnicode_CheckExact(unicode))
1541 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001542#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001543 /* singleton refcount is greater than 1 */
1544 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001545#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001546 return 1;
1547}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549static int
1550unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1551{
1552 PyObject *unicode;
1553 Py_ssize_t old_length;
1554
1555 assert(p_unicode != NULL);
1556 unicode = *p_unicode;
1557
1558 assert(unicode != NULL);
1559 assert(PyUnicode_Check(unicode));
1560 assert(0 <= length);
1561
Victor Stinner910337b2011-10-03 03:20:16 +02001562 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001563 old_length = PyUnicode_WSTR_LENGTH(unicode);
1564 else
1565 old_length = PyUnicode_GET_LENGTH(unicode);
1566 if (old_length == length)
1567 return 0;
1568
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001569 if (length == 0) {
1570 Py_DECREF(*p_unicode);
1571 *p_unicode = unicode_empty;
1572 Py_INCREF(*p_unicode);
1573 return 0;
1574 }
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 PyObject *copy = resize_copy(unicode, length);
1578 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 Py_DECREF(*p_unicode);
1581 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583 }
1584
Victor Stinnerfe226c02011-10-03 03:52:20 +02001585 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001586 PyObject *new_unicode = resize_compact(unicode, length);
1587 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001589 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001590 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001592 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001593 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594}
1595
Alexander Belopolsky40018472011-02-26 01:02:56 +00001596int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001597PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001598{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 PyObject *unicode;
1600 if (p_unicode == NULL) {
1601 PyErr_BadInternalCall();
1602 return -1;
1603 }
1604 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001605 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 {
1607 PyErr_BadInternalCall();
1608 return -1;
1609 }
1610 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001611}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001612
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001613static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001614unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001615{
1616 PyObject *result;
1617 assert(PyUnicode_IS_READY(*p_unicode));
1618 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1619 return 0;
1620 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1621 maxchar);
1622 if (result == NULL)
1623 return -1;
1624 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1625 PyUnicode_GET_LENGTH(*p_unicode));
1626 Py_DECREF(*p_unicode);
1627 *p_unicode = result;
1628 return 0;
1629}
1630
1631static int
1632unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1633 Py_UCS4 ch)
1634{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001635 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001636 if (unicode_widen(p_unicode, ch) < 0)
1637 return -1;
1638 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1639 PyUnicode_DATA(*p_unicode),
1640 (*pos)++, ch);
1641 return 0;
1642}
1643
Victor Stinnerc5166102012-02-22 13:55:02 +01001644/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1645 Return the length of the input string.
1646
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001647 WARNING: The function doesn't copy the terminating null character and
1648 doesn't check the maximum character (may write a latin1 character in an
1649 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001650static Py_ssize_t
1651unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1652{
1653 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1654 void *data = PyUnicode_DATA(unicode);
1655
1656 switch (kind) {
1657 case PyUnicode_1BYTE_KIND: {
1658 Py_ssize_t len = strlen(str);
1659 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001660 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001661 return len;
1662 }
1663 case PyUnicode_2BYTE_KIND: {
1664 Py_UCS2 *start = (Py_UCS2 *)data + index;
1665 Py_UCS2 *ucs2 = start;
1666 assert(index <= PyUnicode_GET_LENGTH(unicode));
1667
1668 for (; *str; ++ucs2, ++str)
1669 *ucs2 = (Py_UCS2)*str;
1670
1671 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1672 return ucs2 - start;
1673 }
1674 default: {
1675 Py_UCS4 *start = (Py_UCS4 *)data + index;
1676 Py_UCS4 *ucs4 = start;
1677 assert(kind == PyUnicode_4BYTE_KIND);
1678 assert(index <= PyUnicode_GET_LENGTH(unicode));
1679
1680 for (; *str; ++ucs4, ++str)
1681 *ucs4 = (Py_UCS4)*str;
1682
1683 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1684 return ucs4 - start;
1685 }
1686 }
1687}
1688
1689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690static PyObject*
1691get_latin1_char(unsigned char ch)
1692{
Victor Stinnera464fc12011-10-02 20:39:30 +02001693 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001695 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 if (!unicode)
1697 return NULL;
1698 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001699 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 unicode_latin1[ch] = unicode;
1701 }
1702 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704}
1705
Alexander Belopolsky40018472011-02-26 01:02:56 +00001706PyObject *
1707PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001709 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 Py_UCS4 maxchar = 0;
1711 Py_ssize_t num_surrogates;
1712
1713 if (u == NULL)
1714 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001716 /* If the Unicode data is known at construction time, we can apply
1717 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 /* Optimization for empty strings */
1720 if (size == 0 && unicode_empty != NULL) {
1721 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001722 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001723 }
Tim Petersced69f82003-09-16 20:30:58 +00001724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 /* Single character Unicode objects in the Latin-1 range are
1726 shared when using this constructor */
1727 if (size == 1 && *u < 256)
1728 return get_latin1_char((unsigned char)*u);
1729
1730 /* If not empty and not single character, copy the Unicode data
1731 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 if (find_maxchar_surrogates(u, u + size,
1733 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 return NULL;
1735
Victor Stinner8faf8212011-12-08 22:14:11 +01001736 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 if (!unicode)
1738 return NULL;
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 switch (PyUnicode_KIND(unicode)) {
1741 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001742 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1744 break;
1745 case PyUnicode_2BYTE_KIND:
1746#if Py_UNICODE_SIZE == 2
1747 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1748#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001749 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1751#endif
1752 break;
1753 case PyUnicode_4BYTE_KIND:
1754#if SIZEOF_WCHAR_T == 2
1755 /* This is the only case which has to process surrogates, thus
1756 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001757 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758#else
1759 assert(num_surrogates == 0);
1760 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1761#endif
1762 break;
1763 default:
1764 assert(0 && "Impossible state");
1765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001767 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768}
1769
Alexander Belopolsky40018472011-02-26 01:02:56 +00001770PyObject *
1771PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 if (size < 0) {
1774 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001776 return NULL;
1777 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001778 if (u != NULL)
1779 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1780 else
1781 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782}
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784PyObject *
1785PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001786{
1787 size_t size = strlen(u);
1788 if (size > PY_SSIZE_T_MAX) {
1789 PyErr_SetString(PyExc_OverflowError, "input too long");
1790 return NULL;
1791 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001792 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001793}
1794
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001795PyObject *
1796_PyUnicode_FromId(_Py_Identifier *id)
1797{
1798 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001799 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1800 strlen(id->string),
1801 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001802 if (!id->object)
1803 return NULL;
1804 PyUnicode_InternInPlace(&id->object);
1805 assert(!id->next);
1806 id->next = static_strings;
1807 static_strings = id;
1808 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001809 return id->object;
1810}
1811
1812void
1813_PyUnicode_ClearStaticStrings()
1814{
1815 _Py_Identifier *i;
1816 for (i = static_strings; i; i = i->next) {
1817 Py_DECREF(i->object);
1818 i->object = NULL;
1819 i->next = NULL;
1820 }
1821}
1822
Benjamin Peterson0df54292012-03-26 14:50:32 -04001823/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824
Victor Stinnere57b1c02011-09-28 22:20:48 +02001825static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001826unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001827{
Victor Stinner785938e2011-12-11 20:09:03 +01001828 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001829 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001830#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001831 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001832#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001833 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001834 }
Victor Stinner785938e2011-12-11 20:09:03 +01001835 unicode = PyUnicode_New(size, 127);
1836 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001837 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001838 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1839 assert(_PyUnicode_CheckConsistency(unicode, 1));
1840 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001841}
1842
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001843static Py_UCS4
1844kind_maxchar_limit(unsigned int kind)
1845{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001846 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001847 case PyUnicode_1BYTE_KIND:
1848 return 0x80;
1849 case PyUnicode_2BYTE_KIND:
1850 return 0x100;
1851 case PyUnicode_4BYTE_KIND:
1852 return 0x10000;
1853 default:
1854 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001855 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001856 }
1857}
1858
Victor Stinner702c7342011-10-05 13:50:52 +02001859static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001860_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001863 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001864
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001865 if (size == 0) {
1866 Py_INCREF(unicode_empty);
1867 return unicode_empty;
1868 }
1869 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001870 if (size == 1)
1871 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001873 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001874 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 if (!res)
1876 return NULL;
1877 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001878 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001880}
1881
Victor Stinnere57b1c02011-09-28 22:20:48 +02001882static PyObject*
1883_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884{
1885 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001886 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001887
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 if (size == 0) {
1889 Py_INCREF(unicode_empty);
1890 return unicode_empty;
1891 }
1892 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001894 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001895
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001896 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001897 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 if (!res)
1899 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001900 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001902 else {
1903 _PyUnicode_CONVERT_BYTES(
1904 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1905 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001906 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 return res;
1908}
1909
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910static PyObject*
1911_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912{
1913 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001915
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 if (size == 0) {
1917 Py_INCREF(unicode_empty);
1918 return unicode_empty;
1919 }
1920 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001921 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001922 return get_latin1_char((unsigned char)u[0]);
1923
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001924 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001925 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!res)
1927 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001928 if (max_char < 256)
1929 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1930 PyUnicode_1BYTE_DATA(res));
1931 else if (max_char < 0x10000)
1932 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1933 PyUnicode_2BYTE_DATA(res));
1934 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001936 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 return res;
1938}
1939
1940PyObject*
1941PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1942{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001943 if (size < 0) {
1944 PyErr_SetString(PyExc_ValueError, "size must be positive");
1945 return NULL;
1946 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001947 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001949 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001951 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001955 PyErr_SetString(PyExc_SystemError, "invalid kind");
1956 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958}
1959
Victor Stinnerece58de2012-04-23 23:36:38 +02001960Py_UCS4
1961_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1962{
1963 enum PyUnicode_Kind kind;
1964 void *startptr, *endptr;
1965
1966 assert(PyUnicode_IS_READY(unicode));
1967 assert(0 <= start);
1968 assert(end <= PyUnicode_GET_LENGTH(unicode));
1969 assert(start <= end);
1970
1971 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
1972 return PyUnicode_MAX_CHAR_VALUE(unicode);
1973
1974 if (start == end)
1975 return 127;
1976
1977 kind = PyUnicode_KIND(unicode);
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001978 startptr = PyUnicode_DATA(unicode) + start * kind;
1979 endptr = startptr + end * kind;
1980 switch(kind) {
1981 case PyUnicode_1BYTE_KIND:
1982 return ucs1lib_find_max_char(startptr, endptr);
1983 case PyUnicode_2BYTE_KIND:
1984 return ucs2lib_find_max_char(startptr, endptr);
1985 case PyUnicode_4BYTE_KIND:
1986 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02001987 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04001988 assert(0);
1989 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02001990 }
1991}
1992
Victor Stinner25a4b292011-10-06 12:31:55 +02001993/* Ensure that a string uses the most efficient storage, if it is not the
1994 case: create a new string with of the right kind. Write NULL into *p_unicode
1995 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001996static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001997unicode_adjust_maxchar(PyObject **p_unicode)
1998{
1999 PyObject *unicode, *copy;
2000 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002001 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002002 unsigned int kind;
2003
2004 assert(p_unicode != NULL);
2005 unicode = *p_unicode;
2006 assert(PyUnicode_IS_READY(unicode));
2007 if (PyUnicode_IS_ASCII(unicode))
2008 return;
2009
2010 len = PyUnicode_GET_LENGTH(unicode);
2011 kind = PyUnicode_KIND(unicode);
2012 if (kind == PyUnicode_1BYTE_KIND) {
2013 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002014 max_char = ucs1lib_find_max_char(u, u + len);
2015 if (max_char >= 128)
2016 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002017 }
2018 else if (kind == PyUnicode_2BYTE_KIND) {
2019 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002020 max_char = ucs2lib_find_max_char(u, u + len);
2021 if (max_char >= 256)
2022 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002023 }
2024 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002025 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002026 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002027 max_char = ucs4lib_find_max_char(u, u + len);
2028 if (max_char >= 0x10000)
2029 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002030 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002031 copy = PyUnicode_New(len, max_char);
2032 copy_characters(copy, 0, unicode, 0, len);
2033 Py_DECREF(unicode);
2034 *p_unicode = copy;
2035}
2036
Victor Stinner034f6cf2011-09-30 02:26:44 +02002037PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002038_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002039{
Victor Stinner87af4f22011-11-21 23:03:47 +01002040 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002041 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002042
Victor Stinner034f6cf2011-09-30 02:26:44 +02002043 if (!PyUnicode_Check(unicode)) {
2044 PyErr_BadInternalCall();
2045 return NULL;
2046 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002047 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002048 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002049
Victor Stinner87af4f22011-11-21 23:03:47 +01002050 length = PyUnicode_GET_LENGTH(unicode);
2051 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002052 if (!copy)
2053 return NULL;
2054 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2055
Victor Stinner87af4f22011-11-21 23:03:47 +01002056 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2057 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002058 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002059 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002060}
2061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062
Victor Stinnerbc603d12011-10-02 01:00:40 +02002063/* Widen Unicode objects to larger buffers. Don't write terminating null
2064 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065
2066void*
2067_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2068{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002069 Py_ssize_t len;
2070 void *result;
2071 unsigned int skind;
2072
Benjamin Petersonbac79492012-01-14 13:34:47 -05002073 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002074 return NULL;
2075
2076 len = PyUnicode_GET_LENGTH(s);
2077 skind = PyUnicode_KIND(s);
2078 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002079 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 return NULL;
2081 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002082 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002083 case PyUnicode_2BYTE_KIND:
2084 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2085 if (!result)
2086 return PyErr_NoMemory();
2087 assert(skind == PyUnicode_1BYTE_KIND);
2088 _PyUnicode_CONVERT_BYTES(
2089 Py_UCS1, Py_UCS2,
2090 PyUnicode_1BYTE_DATA(s),
2091 PyUnicode_1BYTE_DATA(s) + len,
2092 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002094 case PyUnicode_4BYTE_KIND:
2095 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2096 if (!result)
2097 return PyErr_NoMemory();
2098 if (skind == PyUnicode_2BYTE_KIND) {
2099 _PyUnicode_CONVERT_BYTES(
2100 Py_UCS2, Py_UCS4,
2101 PyUnicode_2BYTE_DATA(s),
2102 PyUnicode_2BYTE_DATA(s) + len,
2103 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002105 else {
2106 assert(skind == PyUnicode_1BYTE_KIND);
2107 _PyUnicode_CONVERT_BYTES(
2108 Py_UCS1, Py_UCS4,
2109 PyUnicode_1BYTE_DATA(s),
2110 PyUnicode_1BYTE_DATA(s) + len,
2111 result);
2112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002114 default:
2115 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 }
Victor Stinner01698042011-10-04 00:04:26 +02002117 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118 return NULL;
2119}
2120
2121static Py_UCS4*
2122as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2123 int copy_null)
2124{
2125 int kind;
2126 void *data;
2127 Py_ssize_t len, targetlen;
2128 if (PyUnicode_READY(string) == -1)
2129 return NULL;
2130 kind = PyUnicode_KIND(string);
2131 data = PyUnicode_DATA(string);
2132 len = PyUnicode_GET_LENGTH(string);
2133 targetlen = len;
2134 if (copy_null)
2135 targetlen++;
2136 if (!target) {
2137 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2138 PyErr_NoMemory();
2139 return NULL;
2140 }
2141 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2142 if (!target) {
2143 PyErr_NoMemory();
2144 return NULL;
2145 }
2146 }
2147 else {
2148 if (targetsize < targetlen) {
2149 PyErr_Format(PyExc_SystemError,
2150 "string is longer than the buffer");
2151 if (copy_null && 0 < targetsize)
2152 target[0] = 0;
2153 return NULL;
2154 }
2155 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002156 if (kind == PyUnicode_1BYTE_KIND) {
2157 Py_UCS1 *start = (Py_UCS1 *) data;
2158 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002160 else if (kind == PyUnicode_2BYTE_KIND) {
2161 Py_UCS2 *start = (Py_UCS2 *) data;
2162 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2163 }
2164 else {
2165 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 if (copy_null)
2169 target[len] = 0;
2170 return target;
2171}
2172
2173Py_UCS4*
2174PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2175 int copy_null)
2176{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002177 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 PyErr_BadInternalCall();
2179 return NULL;
2180 }
2181 return as_ucs4(string, target, targetsize, copy_null);
2182}
2183
2184Py_UCS4*
2185PyUnicode_AsUCS4Copy(PyObject *string)
2186{
2187 return as_ucs4(string, NULL, 0, 1);
2188}
2189
2190#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002191
Alexander Belopolsky40018472011-02-26 01:02:56 +00002192PyObject *
2193PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002196 if (size == 0) {
2197 Py_INCREF(unicode_empty);
2198 return unicode_empty;
2199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002200 PyErr_BadInternalCall();
2201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 }
2203
Martin v. Löwis790465f2008-04-05 20:41:37 +00002204 if (size == -1) {
2205 size = wcslen(w);
2206 }
2207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209}
2210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002212
Walter Dörwald346737f2007-05-31 10:44:43 +00002213static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002214makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2215 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002216{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 *fmt++ = '%';
2218 if (width) {
2219 if (zeropad)
2220 *fmt++ = '0';
2221 fmt += sprintf(fmt, "%d", width);
2222 }
2223 if (precision)
2224 fmt += sprintf(fmt, ".%d", precision);
2225 if (longflag)
2226 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002227 else if (longlongflag) {
2228 /* longlongflag should only ever be nonzero on machines with
2229 HAVE_LONG_LONG defined */
2230#ifdef HAVE_LONG_LONG
2231 char *f = PY_FORMAT_LONG_LONG;
2232 while (*f)
2233 *fmt++ = *f++;
2234#else
2235 /* we shouldn't ever get here */
2236 assert(0);
2237 *fmt++ = 'l';
2238#endif
2239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 else if (size_tflag) {
2241 char *f = PY_FORMAT_SIZE_T;
2242 while (*f)
2243 *fmt++ = *f++;
2244 }
2245 *fmt++ = c;
2246 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002247}
2248
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%' pointed to by `f`.

   Reads an optional decimal width, an optional ".precision" and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z"); a
   length modifier is only recognised when directly followed by 'd', 'u' or
   'i'.  Each result is stored through the matching out-pointer unless that
   pointer is NULL.

   Returns a pointer to the conversion character (or, for truncated or bogus
   specifications, to the last character that was consumed). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int w = 0;
    int prec = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        w = (w*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            prec = (prec*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu). */
    if (*f == 'l') {
        switch (f[1]) {
        case 'd':
        case 'u':
        case 'i':
            is_long = 1;
            f += 1;
            break;
#ifdef HAVE_LONG_LONG
        case 'l':
            if (f[2] == 'd' || f[2] == 'u' || f[2] == 'i') {
                is_longlong = 1;
                f += 2;
            }
            break;
#endif
        default:
            break;
        }
    }
    else if (*f == 'z') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
    }

    if (p_width != NULL)
        *p_width = w;
    if (p_precision != NULL)
        *p_precision = prec;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2313
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002314/* maximum number of characters required for output of %ld. 21 characters
2315 allows for 64-bit integers (in decimal) and an optional sign. */
2316#define MAX_LONG_CHARS 21
2317/* maximum number of characters required for output of %lld.
2318 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2319 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2320#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2321
Walter Dörwaldd2034312007-05-18 16:29:38 +00002322PyObject *
2323PyUnicode_FromFormatV(const char *format, va_list vargs)
2324{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 va_list count;
2326 Py_ssize_t callcount = 0;
2327 PyObject **callresults = NULL;
2328 PyObject **callresult = NULL;
2329 Py_ssize_t n = 0;
2330 int width = 0;
2331 int precision = 0;
2332 int zeropad;
2333 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002334 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002336 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2338 Py_UCS4 argmaxchar;
2339 Py_ssize_t numbersize = 0;
2340 char *numberresults = NULL;
2341 char *numberresult = NULL;
2342 Py_ssize_t i;
2343 int kind;
2344 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002345
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002346 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002347 /* step 1: count the number of %S/%R/%A/%s format specifications
2348 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2349 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002351 * also estimate a upper bound for all the number formats in the string,
2352 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002353 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002354 for (f = format; *f; f++) {
2355 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002356 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2358 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2359 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2360 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002363#ifdef HAVE_LONG_LONG
2364 if (longlongflag) {
2365 if (width < MAX_LONG_LONG_CHARS)
2366 width = MAX_LONG_LONG_CHARS;
2367 }
2368 else
2369#endif
2370 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2371 including sign. Decimal takes the most space. This
2372 isn't enough for octal. If a width is specified we
2373 need more (which we allocate later). */
2374 if (width < MAX_LONG_CHARS)
2375 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376
2377 /* account for the size + '\0' to separate numbers
2378 inside of the numberresults buffer */
2379 numbersize += (width + 1);
2380 }
2381 }
2382 else if ((unsigned char)*f > 127) {
2383 PyErr_Format(PyExc_ValueError,
2384 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2385 "string, got a non-ASCII byte: 0x%02x",
2386 (unsigned char)*f);
2387 return NULL;
2388 }
2389 }
2390 /* step 2: allocate memory for the results of
2391 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2392 if (callcount) {
2393 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2394 if (!callresults) {
2395 PyErr_NoMemory();
2396 return NULL;
2397 }
2398 callresult = callresults;
2399 }
2400 /* step 2.5: allocate memory for the results of formating numbers */
2401 if (numbersize) {
2402 numberresults = PyObject_Malloc(numbersize);
2403 if (!numberresults) {
2404 PyErr_NoMemory();
2405 goto fail;
2406 }
2407 numberresult = numberresults;
2408 }
2409
2410 /* step 3: format numbers and figure out how large a buffer we need */
2411 for (f = format; *f; f++) {
2412 if (*f == '%') {
2413 const char* p;
2414 int longflag;
2415 int longlongflag;
2416 int size_tflag;
2417 int numprinted;
2418
2419 p = f;
2420 zeropad = (f[1] == '0');
2421 f = parse_format_flags(f, &width, &precision,
2422 &longflag, &longlongflag, &size_tflag);
2423 switch (*f) {
2424 case 'c':
2425 {
2426 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002427 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 n++;
2429 break;
2430 }
2431 case '%':
2432 n++;
2433 break;
2434 case 'i':
2435 case 'd':
2436 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2437 width, precision, *f);
2438 if (longflag)
2439 numprinted = sprintf(numberresult, fmt,
2440 va_arg(count, long));
2441#ifdef HAVE_LONG_LONG
2442 else if (longlongflag)
2443 numprinted = sprintf(numberresult, fmt,
2444 va_arg(count, PY_LONG_LONG));
2445#endif
2446 else if (size_tflag)
2447 numprinted = sprintf(numberresult, fmt,
2448 va_arg(count, Py_ssize_t));
2449 else
2450 numprinted = sprintf(numberresult, fmt,
2451 va_arg(count, int));
2452 n += numprinted;
2453 /* advance by +1 to skip over the '\0' */
2454 numberresult += (numprinted + 1);
2455 assert(*(numberresult - 1) == '\0');
2456 assert(*(numberresult - 2) != '\0');
2457 assert(numprinted >= 0);
2458 assert(numberresult <= numberresults + numbersize);
2459 break;
2460 case 'u':
2461 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2462 width, precision, 'u');
2463 if (longflag)
2464 numprinted = sprintf(numberresult, fmt,
2465 va_arg(count, unsigned long));
2466#ifdef HAVE_LONG_LONG
2467 else if (longlongflag)
2468 numprinted = sprintf(numberresult, fmt,
2469 va_arg(count, unsigned PY_LONG_LONG));
2470#endif
2471 else if (size_tflag)
2472 numprinted = sprintf(numberresult, fmt,
2473 va_arg(count, size_t));
2474 else
2475 numprinted = sprintf(numberresult, fmt,
2476 va_arg(count, unsigned int));
2477 n += numprinted;
2478 numberresult += (numprinted + 1);
2479 assert(*(numberresult - 1) == '\0');
2480 assert(*(numberresult - 2) != '\0');
2481 assert(numprinted >= 0);
2482 assert(numberresult <= numberresults + numbersize);
2483 break;
2484 case 'x':
2485 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2486 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2487 n += numprinted;
2488 numberresult += (numprinted + 1);
2489 assert(*(numberresult - 1) == '\0');
2490 assert(*(numberresult - 2) != '\0');
2491 assert(numprinted >= 0);
2492 assert(numberresult <= numberresults + numbersize);
2493 break;
2494 case 'p':
2495 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2496 /* %p is ill-defined: ensure leading 0x. */
2497 if (numberresult[1] == 'X')
2498 numberresult[1] = 'x';
2499 else if (numberresult[1] != 'x') {
2500 memmove(numberresult + 2, numberresult,
2501 strlen(numberresult) + 1);
2502 numberresult[0] = '0';
2503 numberresult[1] = 'x';
2504 numprinted += 2;
2505 }
2506 n += numprinted;
2507 numberresult += (numprinted + 1);
2508 assert(*(numberresult - 1) == '\0');
2509 assert(*(numberresult - 2) != '\0');
2510 assert(numprinted >= 0);
2511 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 break;
2513 case 's':
2514 {
2515 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002516 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002517 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002518 if (!str)
2519 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 /* since PyUnicode_DecodeUTF8 returns already flexible
2521 unicode objects, there is no need to call ready on them */
2522 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002523 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002525 /* Remember the str and switch to the next slot */
2526 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 break;
2528 }
2529 case 'U':
2530 {
2531 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002532 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 if (PyUnicode_READY(obj) == -1)
2534 goto fail;
2535 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002536 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 break;
2539 }
2540 case 'V':
2541 {
2542 PyObject *obj = va_arg(count, PyObject *);
2543 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002544 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002545 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002546 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002547 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 if (PyUnicode_READY(obj) == -1)
2549 goto fail;
2550 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002551 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002553 *callresult++ = NULL;
2554 }
2555 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002556 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002557 if (!str_obj)
2558 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002559 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002560 Py_DECREF(str_obj);
2561 goto fail;
2562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002564 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002566 *callresult++ = str_obj;
2567 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 break;
2569 }
2570 case 'S':
2571 {
2572 PyObject *obj = va_arg(count, PyObject *);
2573 PyObject *str;
2574 assert(obj);
2575 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002576 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002578 if (PyUnicode_READY(str) == -1) {
2579 Py_DECREF(str);
2580 goto fail;
2581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002583 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 /* Remember the str and switch to the next slot */
2586 *callresult++ = str;
2587 break;
2588 }
2589 case 'R':
2590 {
2591 PyObject *obj = va_arg(count, PyObject *);
2592 PyObject *repr;
2593 assert(obj);
2594 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002595 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002597 if (PyUnicode_READY(repr) == -1) {
2598 Py_DECREF(repr);
2599 goto fail;
2600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002602 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 /* Remember the repr and switch to the next slot */
2605 *callresult++ = repr;
2606 break;
2607 }
2608 case 'A':
2609 {
2610 PyObject *obj = va_arg(count, PyObject *);
2611 PyObject *ascii;
2612 assert(obj);
2613 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002614 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002616 if (PyUnicode_READY(ascii) == -1) {
2617 Py_DECREF(ascii);
2618 goto fail;
2619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002621 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 /* Remember the repr and switch to the next slot */
2624 *callresult++ = ascii;
2625 break;
2626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 default:
2628 /* if we stumble upon an unknown
2629 formatting code, copy the rest of
2630 the format string to the output
2631 string. (we cannot just skip the
2632 code, since there's no way to know
2633 what's in the argument list) */
2634 n += strlen(p);
2635 goto expand;
2636 }
2637 } else
2638 n++;
2639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002640 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 we don't have to resize the string.
2644 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002645 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 if (!string)
2647 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648 kind = PyUnicode_KIND(string);
2649 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002655 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002656
2657 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2659 /* checking for == because the last argument could be a empty
2660 string, which causes i to point to end, the assert at the end of
2661 the loop */
2662 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002663
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 switch (*f) {
2665 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002666 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 const int ordinal = va_arg(vargs, int);
2668 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002670 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002671 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002676 {
2677 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 /* unused, since we already have the result */
2679 if (*f == 'p')
2680 (void) va_arg(vargs, void *);
2681 else
2682 (void) va_arg(vargs, int);
2683 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002684 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002686 i += written;
2687 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 assert(*numberresult == '\0');
2689 numberresult++;
2690 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002691 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002692 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 case 's':
2694 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002695 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002696 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002697 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 size = PyUnicode_GET_LENGTH(*callresult);
2699 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002700 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002702 /* We're done with the unicode()/repr() => forget it */
2703 Py_DECREF(*callresult);
2704 /* switch to next unicode()/repr() result */
2705 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 break;
2707 }
2708 case 'U':
2709 {
2710 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 Py_ssize_t size;
2712 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2713 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002714 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 break;
2717 }
2718 case 'V':
2719 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002722 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002723 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 size = PyUnicode_GET_LENGTH(obj);
2725 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002726 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 size = PyUnicode_GET_LENGTH(*callresult);
2730 assert(PyUnicode_KIND(*callresult) <=
2731 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002732 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002734 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002736 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 break;
2738 }
2739 case 'S':
2740 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002741 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002743 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 /* unused, since we already have the result */
2745 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002747 copy_characters(string, i, *callresult, 0, size);
2748 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 /* We're done with the unicode()/repr() => forget it */
2750 Py_DECREF(*callresult);
2751 /* switch to next unicode()/repr() result */
2752 ++callresult;
2753 break;
2754 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002755 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002757 break;
2758 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002759 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 goto end;
2762 }
Victor Stinner1205f272010-09-11 00:54:47 +00002763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 else {
2765 assert(i < PyUnicode_GET_LENGTH(string));
2766 PyUnicode_WRITE(kind, data, i++, *f);
2767 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002770
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 if (callresults)
2773 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 if (numberresults)
2775 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002776 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 if (callresults) {
2779 PyObject **callresult2 = callresults;
2780 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002781 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 ++callresult2;
2783 }
2784 PyObject_Free(callresults);
2785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 if (numberresults)
2787 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002789}
2790
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791PyObject *
2792PyUnicode_FromFormat(const char *format, ...)
2793{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 PyObject* ret;
2795 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002796
2797#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002799#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002800 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002801#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002802 ret = PyUnicode_FromFormatV(format, vargs);
2803 va_end(vargs);
2804 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002805}
2806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807#ifdef HAVE_WCHAR_H
2808
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2810 convert a Unicode object to a wide character string.
2811
Victor Stinnerd88d9832011-09-06 02:00:05 +02002812 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002813 character) required to convert the unicode object. Ignore size argument.
2814
Victor Stinnerd88d9832011-09-06 02:00:05 +02002815 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002817 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002818static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002819unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002820 wchar_t *w,
2821 Py_ssize_t size)
2822{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002823 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 const wchar_t *wstr;
2825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002826 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 if (wstr == NULL)
2828 return -1;
2829
Victor Stinner5593d8a2010-10-02 11:11:27 +00002830 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002831 if (size > res)
2832 size = res + 1;
2833 else
2834 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002836 return res;
2837 }
2838 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002840}
2841
2842Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002843PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002844 wchar_t *w,
2845 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846{
2847 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 PyErr_BadInternalCall();
2849 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002851 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852}
2853
Victor Stinner137c34c2010-09-29 10:25:54 +00002854wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002855PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 Py_ssize_t *size)
2857{
2858 wchar_t* buffer;
2859 Py_ssize_t buflen;
2860
2861 if (unicode == NULL) {
2862 PyErr_BadInternalCall();
2863 return NULL;
2864 }
2865
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002866 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002867 if (buflen == -1)
2868 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002870 PyErr_NoMemory();
2871 return NULL;
2872 }
2873
Victor Stinner137c34c2010-09-29 10:25:54 +00002874 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2875 if (buffer == NULL) {
2876 PyErr_NoMemory();
2877 return NULL;
2878 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002879 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 if (buflen == -1)
2881 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002882 if (size != NULL)
2883 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002884 return buffer;
2885}
2886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002887#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888
Alexander Belopolsky40018472011-02-26 01:02:56 +00002889PyObject *
2890PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002893 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002894 PyErr_SetString(PyExc_ValueError,
2895 "chr() arg not in range(0x110000)");
2896 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002897 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 if (ordinal < 256)
2900 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 v = PyUnicode_New(1, ordinal);
2903 if (v == NULL)
2904 return NULL;
2905 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002906 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002908}
2909
Alexander Belopolsky40018472011-02-26 01:02:56 +00002910PyObject *
2911PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002913 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002915 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002916 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002917 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 Py_INCREF(obj);
2919 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002920 }
2921 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002922 /* For a Unicode subtype that's not a Unicode object,
2923 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002924 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002925 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002926 PyErr_Format(PyExc_TypeError,
2927 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002928 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002929 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002930}
2931
Alexander Belopolsky40018472011-02-26 01:02:56 +00002932PyObject *
2933PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002934 const char *encoding,
2935 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002936{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002937 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002938 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002939
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 PyErr_BadInternalCall();
2942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002944
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002945 /* Decoding bytes objects is the most common case and should be fast */
2946 if (PyBytes_Check(obj)) {
2947 if (PyBytes_GET_SIZE(obj) == 0) {
2948 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002949 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002950 }
2951 else {
2952 v = PyUnicode_Decode(
2953 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2954 encoding, errors);
2955 }
2956 return v;
2957 }
2958
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002959 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 PyErr_SetString(PyExc_TypeError,
2961 "decoding str is not supported");
2962 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002963 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002964
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002965 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2966 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2967 PyErr_Format(PyExc_TypeError,
2968 "coercing to str: need bytes, bytearray "
2969 "or buffer-like object, %.80s found",
2970 Py_TYPE(obj)->tp_name);
2971 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002972 }
Tim Petersced69f82003-09-16 20:30:58 +00002973
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002974 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002976 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 }
Tim Petersced69f82003-09-16 20:30:58 +00002978 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002979 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002980
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002981 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002982 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983}
2984
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower is the output buffer and lower_len its capacity in bytes,
   including room for the terminating null byte.

   Return 0 on error (the normalized name, or the default name for a NULL
   encoding, does not fit into lower_len-1 characters), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the terminating null byte. */
        if (sizeof(default_encoding) > lower_len)
            return 0;
        memcpy(lower, default_encoding, sizeof(default_encoding));
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003024 Py_ssize_t size,
3025 const char *encoding,
3026 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003027{
3028 PyObject *buffer = NULL, *unicode;
3029 Py_buffer info;
3030 char lower[11]; /* Enough for any encoding shortcut */
3031
Fred Drakee4315f52000-05-09 19:53:39 +00003032 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003033 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003034 if ((strcmp(lower, "utf-8") == 0) ||
3035 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003036 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003037 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003038 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003039 (strcmp(lower, "iso-8859-1") == 0))
3040 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003041#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003042 else if (strcmp(lower, "mbcs") == 0)
3043 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003044#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003045 else if (strcmp(lower, "ascii") == 0)
3046 return PyUnicode_DecodeASCII(s, size, errors);
3047 else if (strcmp(lower, "utf-16") == 0)
3048 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3049 else if (strcmp(lower, "utf-32") == 0)
3050 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
3053 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003054 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003055 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003056 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003057 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 if (buffer == NULL)
3059 goto onError;
3060 unicode = PyCodec_Decode(buffer, encoding, errors);
3061 if (unicode == NULL)
3062 goto onError;
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003065 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003066 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 Py_DECREF(unicode);
3068 goto onError;
3069 }
3070 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003071 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003072
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 Py_XDECREF(buffer);
3075 return NULL;
3076}
3077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078PyObject *
3079PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003080 const char *encoding,
3081 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003082{
3083 PyObject *v;
3084
3085 if (!PyUnicode_Check(unicode)) {
3086 PyErr_BadArgument();
3087 goto onError;
3088 }
3089
3090 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003092
3093 /* Decode via the codec registry */
3094 v = PyCodec_Decode(unicode, encoding, errors);
3095 if (v == NULL)
3096 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003097 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003098
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003100 return NULL;
3101}
3102
Alexander Belopolsky40018472011-02-26 01:02:56 +00003103PyObject *
3104PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003105 const char *encoding,
3106 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003107{
3108 PyObject *v;
3109
3110 if (!PyUnicode_Check(unicode)) {
3111 PyErr_BadArgument();
3112 goto onError;
3113 }
3114
3115 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003117
3118 /* Decode via the codec registry */
3119 v = PyCodec_Decode(unicode, encoding, errors);
3120 if (v == NULL)
3121 goto onError;
3122 if (!PyUnicode_Check(v)) {
3123 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003124 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003125 Py_TYPE(v)->tp_name);
3126 Py_DECREF(v);
3127 goto onError;
3128 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003129 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003130
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003132 return NULL;
3133}
3134
Alexander Belopolsky40018472011-02-26 01:02:56 +00003135PyObject *
3136PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003137 Py_ssize_t size,
3138 const char *encoding,
3139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140{
3141 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003142
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 unicode = PyUnicode_FromUnicode(s, size);
3144 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3147 Py_DECREF(unicode);
3148 return v;
3149}
3150
Alexander Belopolsky40018472011-02-26 01:02:56 +00003151PyObject *
3152PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003153 const char *encoding,
3154 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003155{
3156 PyObject *v;
3157
3158 if (!PyUnicode_Check(unicode)) {
3159 PyErr_BadArgument();
3160 goto onError;
3161 }
3162
3163 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003165
3166 /* Encode via the codec registry */
3167 v = PyCodec_Encode(unicode, encoding, errors);
3168 if (v == NULL)
3169 goto onError;
3170 return v;
3171
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003173 return NULL;
3174}
3175
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one
   surrogate pair at a time when wchar_t is 2 bytes wide).

   Returns the index of that character in wstr, or 0 when every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    /* Room for a surrogate pair plus the terminating null. */
    wchar_t unit[3] = {0, 0, 0};
#else
    /* One character plus the terminating null. */
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;
        size_t converted;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = cursor[0];
        cursor += 1;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3222
Victor Stinner1b579672011-12-17 05:47:23 +01003223static int
3224locale_error_handler(const char *errors, int *surrogateescape)
3225{
3226 if (errors == NULL) {
3227 *surrogateescape = 0;
3228 return 0;
3229 }
3230
3231 if (strcmp(errors, "strict") == 0) {
3232 *surrogateescape = 0;
3233 return 0;
3234 }
3235 if (strcmp(errors, "surrogateescape") == 0) {
3236 *surrogateescape = 1;
3237 return 0;
3238 }
3239 PyErr_Format(PyExc_ValueError,
3240 "only 'strict' and 'surrogateescape' error handlers "
3241 "are supported, not '%s'",
3242 errors);
3243 return -1;
3244}
3245
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003246PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003247PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003248{
3249 Py_ssize_t wlen, wlen2;
3250 wchar_t *wstr;
3251 PyObject *bytes = NULL;
3252 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003253 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003254 PyObject *exc;
3255 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003256 int surrogateescape;
3257
3258 if (locale_error_handler(errors, &surrogateescape) < 0)
3259 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003260
3261 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3262 if (wstr == NULL)
3263 return NULL;
3264
3265 wlen2 = wcslen(wstr);
3266 if (wlen2 != wlen) {
3267 PyMem_Free(wstr);
3268 PyErr_SetString(PyExc_TypeError, "embedded null character");
3269 return NULL;
3270 }
3271
3272 if (surrogateescape) {
3273 /* locale encoding with surrogateescape */
3274 char *str;
3275
3276 str = _Py_wchar2char(wstr, &error_pos);
3277 if (str == NULL) {
3278 if (error_pos == (size_t)-1) {
3279 PyErr_NoMemory();
3280 PyMem_Free(wstr);
3281 return NULL;
3282 }
3283 else {
3284 goto encode_error;
3285 }
3286 }
3287 PyMem_Free(wstr);
3288
3289 bytes = PyBytes_FromString(str);
3290 PyMem_Free(str);
3291 }
3292 else {
3293 size_t len, len2;
3294
3295 len = wcstombs(NULL, wstr, 0);
3296 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003297 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003298 goto encode_error;
3299 }
3300
3301 bytes = PyBytes_FromStringAndSize(NULL, len);
3302 if (bytes == NULL) {
3303 PyMem_Free(wstr);
3304 return NULL;
3305 }
3306
3307 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3308 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003309 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003310 goto encode_error;
3311 }
3312 PyMem_Free(wstr);
3313 }
3314 return bytes;
3315
3316encode_error:
3317 errmsg = strerror(errno);
3318 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003319
3320 if (error_pos == (size_t)-1)
3321 error_pos = wcstombs_errorpos(wstr);
3322
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003323 PyMem_Free(wstr);
3324 Py_XDECREF(bytes);
3325
Victor Stinner2f197072011-12-17 07:08:30 +01003326 if (errmsg != NULL) {
3327 size_t errlen;
3328 wstr = _Py_char2wchar(errmsg, &errlen);
3329 if (wstr != NULL) {
3330 reason = PyUnicode_FromWideChar(wstr, errlen);
3331 PyMem_Free(wstr);
3332 } else
3333 errmsg = NULL;
3334 }
3335 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003336 reason = PyUnicode_FromString(
3337 "wcstombs() encountered an unencodable "
3338 "wide character");
3339 if (reason == NULL)
3340 return NULL;
3341
3342 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3343 "locale", unicode,
3344 (Py_ssize_t)error_pos,
3345 (Py_ssize_t)(error_pos+1),
3346 reason);
3347 Py_DECREF(reason);
3348 if (exc != NULL) {
3349 PyCodec_StrictErrors(exc);
3350 Py_XDECREF(exc);
3351 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003352 return NULL;
3353}
3354
Victor Stinnerad158722010-10-27 00:25:46 +00003355PyObject *
3356PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003357{
Victor Stinner99b95382011-07-04 14:23:54 +02003358#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003359 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003360#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003362#else
Victor Stinner793b5312011-04-27 00:24:21 +02003363 PyInterpreterState *interp = PyThreadState_GET()->interp;
3364 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3365 cannot use it to encode and decode filenames before it is loaded. Load
3366 the Python codec requires to encode at least its own filename. Use the C
3367 version of the locale codec until the codec registry is initialized and
3368 the Python codec is loaded.
3369
3370 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3371 cannot only rely on it: check also interp->fscodec_initialized for
3372 subinterpreters. */
3373 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003374 return PyUnicode_AsEncodedString(unicode,
3375 Py_FileSystemDefaultEncoding,
3376 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003377 }
3378 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003379 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003380 }
Victor Stinnerad158722010-10-27 00:25:46 +00003381#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003382}
3383
Alexander Belopolsky40018472011-02-26 01:02:56 +00003384PyObject *
3385PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003386 const char *encoding,
3387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388{
3389 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003390 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003391
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 if (!PyUnicode_Check(unicode)) {
3393 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003394 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 }
Fred Drakee4315f52000-05-09 19:53:39 +00003396
Fred Drakee4315f52000-05-09 19:53:39 +00003397 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003398 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003399 if ((strcmp(lower, "utf-8") == 0) ||
3400 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003401 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003402 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003404 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003405 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003406 }
Victor Stinner37296e82010-06-10 13:36:23 +00003407 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003408 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003409 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003411#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003412 else if (strcmp(lower, "mbcs") == 0)
3413 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003414#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003415 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003416 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418
3419 /* Encode via the codec registry */
3420 v = PyCodec_Encode(unicode, encoding, errors);
3421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003422 return NULL;
3423
3424 /* The normal path */
3425 if (PyBytes_Check(v))
3426 return v;
3427
3428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003429 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003430 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003431 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003432
3433 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3434 "encoder %s returned bytearray instead of bytes",
3435 encoding);
3436 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003437 Py_DECREF(v);
3438 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003439 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003440
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003441 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3442 Py_DECREF(v);
3443 return b;
3444 }
3445
3446 PyErr_Format(PyExc_TypeError,
3447 "encoder did not return a bytes object (type=%.400s)",
3448 Py_TYPE(v)->tp_name);
3449 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003450 return NULL;
3451}
3452
Alexander Belopolsky40018472011-02-26 01:02:56 +00003453PyObject *
3454PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003455 const char *encoding,
3456 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003457{
3458 PyObject *v;
3459
3460 if (!PyUnicode_Check(unicode)) {
3461 PyErr_BadArgument();
3462 goto onError;
3463 }
3464
3465 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003467
3468 /* Encode via the codec registry */
3469 v = PyCodec_Encode(unicode, encoding, errors);
3470 if (v == NULL)
3471 goto onError;
3472 if (!PyUnicode_Check(v)) {
3473 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003474 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003475 Py_TYPE(v)->tp_name);
3476 Py_DECREF(v);
3477 goto onError;
3478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003480
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 return NULL;
3483}
3484
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert (invalid or incomplete character).

   Returns the byte offset of that sequence, or 0 when none is found or
   when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t ch;

    /* Start from a zeroed conversion state. */
    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&ch, (char*)cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3515
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003516PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003517PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003518 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519{
3520 wchar_t smallbuf[256];
3521 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3522 wchar_t *wstr;
3523 size_t wlen, wlen2;
3524 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003525 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003526 size_t error_pos;
3527 char *errmsg;
3528 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003529
3530 if (locale_error_handler(errors, &surrogateescape) < 0)
3531 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003532
3533 if (str[len] != '\0' || len != strlen(str)) {
3534 PyErr_SetString(PyExc_TypeError, "embedded null character");
3535 return NULL;
3536 }
3537
3538 if (surrogateescape)
3539 {
3540 wstr = _Py_char2wchar(str, &wlen);
3541 if (wstr == NULL) {
3542 if (wlen == (size_t)-1)
3543 PyErr_NoMemory();
3544 else
3545 PyErr_SetFromErrno(PyExc_OSError);
3546 return NULL;
3547 }
3548
3549 unicode = PyUnicode_FromWideChar(wstr, wlen);
3550 PyMem_Free(wstr);
3551 }
3552 else {
3553#ifndef HAVE_BROKEN_MBSTOWCS
3554 wlen = mbstowcs(NULL, str, 0);
3555#else
3556 wlen = len;
3557#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003558 if (wlen == (size_t)-1)
3559 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560 if (wlen+1 <= smallbuf_len) {
3561 wstr = smallbuf;
3562 }
3563 else {
3564 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3565 return PyErr_NoMemory();
3566
3567 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3568 if (!wstr)
3569 return PyErr_NoMemory();
3570 }
3571
3572 /* This shouldn't fail now */
3573 wlen2 = mbstowcs(wstr, str, wlen+1);
3574 if (wlen2 == (size_t)-1) {
3575 if (wstr != smallbuf)
3576 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003577 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003578 }
3579#ifdef HAVE_BROKEN_MBSTOWCS
3580 assert(wlen2 == wlen);
3581#endif
3582 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3583 if (wstr != smallbuf)
3584 PyMem_Free(wstr);
3585 }
3586 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003587
3588decode_error:
3589 errmsg = strerror(errno);
3590 assert(errmsg != NULL);
3591
3592 error_pos = mbstowcs_errorpos(str, len);
3593 if (errmsg != NULL) {
3594 size_t errlen;
3595 wstr = _Py_char2wchar(errmsg, &errlen);
3596 if (wstr != NULL) {
3597 reason = PyUnicode_FromWideChar(wstr, errlen);
3598 PyMem_Free(wstr);
3599 } else
3600 errmsg = NULL;
3601 }
3602 if (errmsg == NULL)
3603 reason = PyUnicode_FromString(
3604 "mbstowcs() encountered an invalid multibyte sequence");
3605 if (reason == NULL)
3606 return NULL;
3607
3608 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3609 "locale", str, len,
3610 (Py_ssize_t)error_pos,
3611 (Py_ssize_t)(error_pos+1),
3612 reason);
3613 Py_DECREF(reason);
3614 if (exc != NULL) {
3615 PyCodec_StrictErrors(exc);
3616 Py_XDECREF(exc);
3617 }
3618 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003619}
3620
3621PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003622PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003623{
3624 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003625 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003626}
3627
3628
3629PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003630PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003631 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003632 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3633}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003634
Christian Heimes5894ba72007-11-04 11:43:14 +00003635PyObject*
3636PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3637{
Victor Stinner99b95382011-07-04 14:23:54 +02003638#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003639 return PyUnicode_DecodeMBCS(s, size, NULL);
3640#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003641 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003642#else
Victor Stinner793b5312011-04-27 00:24:21 +02003643 PyInterpreterState *interp = PyThreadState_GET()->interp;
3644 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3645 cannot use it to encode and decode filenames before it is loaded. Load
3646 the Python codec requires to encode at least its own filename. Use the C
3647 version of the locale codec until the codec registry is initialized and
3648 the Python codec is loaded.
3649
3650 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3651 cannot only rely on it: check also interp->fscodec_initialized for
3652 subinterpreters. */
3653 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003654 return PyUnicode_Decode(s, size,
3655 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003656 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003657 }
3658 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003659 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003660 }
Victor Stinnerad158722010-10-27 00:25:46 +00003661#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003662}
3663
Martin v. Löwis011e8422009-05-05 04:43:17 +00003664
3665int
Antoine Pitrou13348842012-01-29 18:36:34 +01003666_PyUnicode_HasNULChars(PyObject* s)
3667{
3668 static PyObject *nul = NULL;
3669
3670 if (nul == NULL)
3671 nul = PyUnicode_FromStringAndSize("\0", 1);
3672 if (nul == NULL)
3673 return -1;
3674 return PyUnicode_Contains(s, nul);
3675}
3676
3677
3678int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003679PyUnicode_FSConverter(PyObject* arg, void* addr)
3680{
3681 PyObject *output = NULL;
3682 Py_ssize_t size;
3683 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003684 if (arg == NULL) {
3685 Py_DECREF(*(PyObject**)addr);
3686 return 1;
3687 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003688 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003689 output = arg;
3690 Py_INCREF(output);
3691 }
3692 else {
3693 arg = PyUnicode_FromObject(arg);
3694 if (!arg)
3695 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003696 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 Py_DECREF(arg);
3698 if (!output)
3699 return 0;
3700 if (!PyBytes_Check(output)) {
3701 Py_DECREF(output);
3702 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3703 return 0;
3704 }
3705 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003706 size = PyBytes_GET_SIZE(output);
3707 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003708 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003709 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003710 Py_DECREF(output);
3711 return 0;
3712 }
3713 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003714 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003715}
3716
3717
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003718int
3719PyUnicode_FSDecoder(PyObject* arg, void* addr)
3720{
3721 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003722 if (arg == NULL) {
3723 Py_DECREF(*(PyObject**)addr);
3724 return 1;
3725 }
3726 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003727 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003729 output = arg;
3730 Py_INCREF(output);
3731 }
3732 else {
3733 arg = PyBytes_FromObject(arg);
3734 if (!arg)
3735 return 0;
3736 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3737 PyBytes_GET_SIZE(arg));
3738 Py_DECREF(arg);
3739 if (!output)
3740 return 0;
3741 if (!PyUnicode_Check(output)) {
3742 Py_DECREF(output);
3743 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3744 return 0;
3745 }
3746 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003747 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003748 Py_DECREF(output);
3749 return 0;
3750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003752 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003753 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3754 Py_DECREF(output);
3755 return 0;
3756 }
3757 *(PyObject**)addr = output;
3758 return Py_CLEANUP_SUPPORTED;
3759}
3760
3761
Martin v. Löwis5b222132007-06-10 09:51:05 +00003762char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003764{
Christian Heimesf3863112007-11-22 07:46:41 +00003765 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003767 if (!PyUnicode_Check(unicode)) {
3768 PyErr_BadArgument();
3769 return NULL;
3770 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003771 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003772 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003774 if (PyUnicode_UTF8(unicode) == NULL) {
3775 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3777 if (bytes == NULL)
3778 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003779 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3780 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 Py_DECREF(bytes);
3782 return NULL;
3783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3785 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3786 PyBytes_AS_STRING(bytes),
3787 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 Py_DECREF(bytes);
3789 }
3790
3791 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003792 *psize = PyUnicode_UTF8_LENGTH(unicode);
3793 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003794}
3795
3796char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3800}
3801
3802#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003803static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003804#endif
3805
3806
3807Py_UNICODE *
3808PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 const unsigned char *one_byte;
3811#if SIZEOF_WCHAR_T == 4
3812 const Py_UCS2 *two_bytes;
3813#else
3814 const Py_UCS4 *four_bytes;
3815 const Py_UCS4 *ucs4_end;
3816 Py_ssize_t num_surrogates;
3817#endif
3818 wchar_t *w;
3819 wchar_t *wchar_end;
3820
3821 if (!PyUnicode_Check(unicode)) {
3822 PyErr_BadArgument();
3823 return NULL;
3824 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003825 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 assert(_PyUnicode_KIND(unicode) != 0);
3828 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829
3830#ifdef Py_DEBUG
3831 ++unicode_as_unicode_calls;
3832#endif
3833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003836 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3837 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 num_surrogates = 0;
3839
3840 for (; four_bytes < ucs4_end; ++four_bytes) {
3841 if (*four_bytes > 0xFFFF)
3842 ++num_surrogates;
3843 }
3844
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003845 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3846 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3847 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 PyErr_NoMemory();
3849 return NULL;
3850 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 w = _PyUnicode_WSTR(unicode);
3854 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3855 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3857 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003858 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003860 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3861 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 }
3863 else
3864 *w = *four_bytes;
3865
3866 if (w > wchar_end) {
3867 assert(0 && "Miscalculated string end");
3868 }
3869 }
3870 *w = 0;
3871#else
3872 /* sizeof(wchar_t) == 4 */
3873 Py_FatalError("Impossible unicode object state, wstr and str "
3874 "should share memory already.");
3875 return NULL;
3876#endif
3877 }
3878 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3880 (_PyUnicode_LENGTH(unicode) + 1));
3881 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 PyErr_NoMemory();
3883 return NULL;
3884 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3886 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3887 w = _PyUnicode_WSTR(unicode);
3888 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3891 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 for (; w < wchar_end; ++one_byte, ++w)
3893 *w = *one_byte;
3894 /* null-terminate the wstr */
3895 *w = 0;
3896 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003897 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900 for (; w < wchar_end; ++two_bytes, ++w)
3901 *w = *two_bytes;
3902 /* null-terminate the wstr */
3903 *w = 0;
3904#else
3905 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003906 PyObject_FREE(_PyUnicode_WSTR(unicode));
3907 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 Py_FatalError("Impossible unicode object state, wstr "
3909 "and str should share memory already.");
3910 return NULL;
3911#endif
3912 }
3913 else {
3914 assert(0 && "This should never happen.");
3915 }
3916 }
3917 }
3918 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003919 *size = PyUnicode_WSTR_LENGTH(unicode);
3920 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003921}
3922
Alexander Belopolsky40018472011-02-26 01:02:56 +00003923Py_UNICODE *
3924PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927}
3928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929
Alexander Belopolsky40018472011-02-26 01:02:56 +00003930Py_ssize_t
3931PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932{
3933 if (!PyUnicode_Check(unicode)) {
3934 PyErr_BadArgument();
3935 goto onError;
3936 }
3937 return PyUnicode_GET_SIZE(unicode);
3938
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 return -1;
3941}
3942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943Py_ssize_t
3944PyUnicode_GetLength(PyObject *unicode)
3945{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003946 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 PyErr_BadArgument();
3948 return -1;
3949 }
3950
3951 return PyUnicode_GET_LENGTH(unicode);
3952}
3953
3954Py_UCS4
3955PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3956{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003957 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3958 PyErr_BadArgument();
3959 return (Py_UCS4)-1;
3960 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003961 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003962 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 return (Py_UCS4)-1;
3964 }
3965 return PyUnicode_READ_CHAR(unicode, index);
3966}
3967
3968int
3969PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3970{
3971 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003972 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 return -1;
3974 }
Victor Stinner488fa492011-12-12 00:01:39 +01003975 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003976 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003977 PyErr_SetString(PyExc_IndexError, "string index out of range");
3978 return -1;
3979 }
Victor Stinner488fa492011-12-12 00:01:39 +01003980 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003981 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003982 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3983 PyErr_SetString(PyExc_ValueError, "character out of range");
3984 return -1;
3985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3987 index, ch);
3988 return 0;
3989}
3990
/* Return the name of the default encoding.  The value is the fixed string
   "utf-8"; the returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3996
Victor Stinner554f3f02010-06-16 23:33:54 +00003997/* create or adjust a UnicodeDecodeError */
3998static void
3999make_decode_exception(PyObject **exceptionObject,
4000 const char *encoding,
4001 const char *input, Py_ssize_t length,
4002 Py_ssize_t startpos, Py_ssize_t endpos,
4003 const char *reason)
4004{
4005 if (*exceptionObject == NULL) {
4006 *exceptionObject = PyUnicodeDecodeError_Create(
4007 encoding, input, length, startpos, endpos, reason);
4008 }
4009 else {
4010 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4011 goto onError;
4012 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4013 goto onError;
4014 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4015 goto onError;
4016 }
4017 return;
4018
4019onError:
4020 Py_DECREF(*exceptionObject);
4021 *exceptionObject = NULL;
4022}
4023
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024/* error handling callback helper:
4025 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004026 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 and adjust various state variables.
4028 return 0 on success, -1 on error
4029*/
4030
Alexander Belopolsky40018472011-02-26 01:02:56 +00004031static int
4032unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004033 const char *encoding, const char *reason,
4034 const char **input, const char **inend, Py_ssize_t *startinpos,
4035 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004036 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004038 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039
4040 PyObject *restuple = NULL;
4041 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004042 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004043 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t requiredsize;
4045 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004046 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 int res = -1;
4048
Victor Stinner596a6c42011-11-09 00:02:18 +01004049 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4050 outsize = PyUnicode_GET_LENGTH(*output);
4051 else
4052 outsize = _PyUnicode_WSTR_LENGTH(*output);
4053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 *errorHandler = PyCodec_LookupError(errors);
4056 if (*errorHandler == NULL)
4057 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 }
4059
Victor Stinner554f3f02010-06-16 23:33:54 +00004060 make_decode_exception(exceptionObject,
4061 encoding,
4062 *input, *inend - *input,
4063 *startinpos, *endinpos,
4064 reason);
4065 if (*exceptionObject == NULL)
4066 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067
4068 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4069 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004072 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 }
4075 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004077 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004078 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004079
4080 /* Copy back the bytes variables, which might have been modified by the
4081 callback */
4082 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4083 if (!inputobj)
4084 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004085 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004087 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004088 *input = PyBytes_AS_STRING(inputobj);
4089 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004090 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004091 /* we can DECREF safely, as the exception has another reference,
4092 so the object won't go away. */
4093 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004097 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004098 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4099 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004100 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4103 /* need more space? (at least enough for what we
4104 have+the replacement+the rest of the string (starting
4105 at the new input position), so we won't have to check space
4106 when there are no errors in the rest of the string) */
4107 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4108 requiredsize = *outpos + replen + insize-newpos;
4109 if (requiredsize > outsize) {
4110 if (requiredsize<2*outsize)
4111 requiredsize = 2*outsize;
4112 if (unicode_resize(output, requiredsize) < 0)
4113 goto onError;
4114 }
4115 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004117 copy_characters(*output, *outpos, repunicode, 0, replen);
4118 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004120 else {
4121 wchar_t *repwstr;
4122 Py_ssize_t repwlen;
4123 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4124 if (repwstr == NULL)
4125 goto onError;
4126 /* need more space? (at least enough for what we
4127 have+the replacement+the rest of the string (starting
4128 at the new input position), so we won't have to check space
4129 when there are no errors in the rest of the string) */
4130 requiredsize = *outpos + repwlen + insize-newpos;
4131 if (requiredsize > outsize) {
4132 if (requiredsize < 2*outsize)
4133 requiredsize = 2*outsize;
4134 if (unicode_resize(output, requiredsize) < 0)
4135 goto onError;
4136 }
4137 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4138 *outpos += repwlen;
4139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004141 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 /* we made it! */
4144 res = 0;
4145
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 Py_XDECREF(restuple);
4148 return res;
4149}
4150
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004151/* --- UTF-7 Codec -------------------------------------------------------- */
4152
Antoine Pitrou244651a2009-05-04 18:56:13 +00004153/* See RFC2152 for details. We encode conservatively and decode liberally. */
4154
/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass side-effect free expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * All three arguments are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) can be passed for the flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004231
Alexander Belopolsky40018472011-02-26 01:02:56 +00004232PyObject *
4233PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004234 Py_ssize_t size,
4235 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004236{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004237 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4238}
4239
Antoine Pitrou244651a2009-05-04 18:56:13 +00004240/* The decoder. The only state we preserve is our read position,
4241 * i.e. how many characters we have consumed. So if we end in the
4242 * middle of a shift sequence we have to back off the read position
4243 * and the output to the beginning of the sequence, otherwise we lose
4244 * all the shift state (seen bits, number of bits seen, high
4245 * surrogate). */
4246
Alexander Belopolsky40018472011-02-26 01:02:56 +00004247PyObject *
4248PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004249 Py_ssize_t size,
4250 const char *errors,
4251 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004252{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004254 Py_ssize_t startinpos;
4255 Py_ssize_t endinpos;
4256 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004258 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004259 const char *errmsg = "";
4260 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004261 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004262 unsigned int base64bits = 0;
4263 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004264 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 PyObject *errorHandler = NULL;
4266 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004267
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004268 /* Start off assuming it's all ASCII. Widen later as necessary. */
4269 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270 if (!unicode)
4271 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004272 if (size == 0) {
4273 if (consumed)
4274 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004275 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004276 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004277
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004278 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004279 e = s + size;
4280
4281 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004282 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004284 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285
Antoine Pitrou244651a2009-05-04 18:56:13 +00004286 if (inShift) { /* in a base-64 section */
4287 if (IS_BASE64(ch)) { /* consume a base-64 character */
4288 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4289 base64bits += 6;
4290 s++;
4291 if (base64bits >= 16) {
4292 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004293 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 base64bits -= 16;
4295 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4296 if (surrogate) {
4297 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004298 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4299 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004300 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4301 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004303 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304 }
4305 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004306 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4307 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004308 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 }
4310 }
Victor Stinner551ac952011-11-29 22:58:13 +01004311 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004312 /* first surrogate */
4313 surrogate = outCh;
4314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004316 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4317 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 }
4319 }
4320 }
4321 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 inShift = 0;
4323 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004325 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4326 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004327 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329 if (base64bits > 0) { /* left-over bits */
4330 if (base64bits >= 6) {
4331 /* We've seen at least one base-64 character */
4332 errmsg = "partial character in shift sequence";
4333 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 else {
4336 /* Some bits remain; they should be zero */
4337 if (base64buffer != 0) {
4338 errmsg = "non-zero padding bits in shift sequence";
4339 goto utf7Error;
4340 }
4341 }
4342 }
4343 if (ch != '-') {
4344 /* '-' is absorbed; other terminating
4345 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004346 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4347 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 }
4350 }
4351 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 s++; /* consume '+' */
4354 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 }
4359 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004361 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363 }
4364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4367 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368 s++;
4369 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 else {
4371 startinpos = s-starts;
4372 s++;
4373 errmsg = "unexpected special character";
4374 goto utf7Error;
4375 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004376 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 endinpos = s-starts;
4379 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 errors, &errorHandler,
4381 "utf7", errmsg,
4382 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004383 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
4386
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 /* end of string */
4388
4389 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4390 /* if we're in an inconsistent state, that's an error */
4391 if (surrogate ||
4392 (base64bits >= 6) ||
4393 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 endinpos = size;
4395 if (unicode_decode_call_errorhandler(
4396 errors, &errorHandler,
4397 "utf7", "unterminated shift sequence",
4398 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 goto onError;
4401 if (s < e)
4402 goto restart;
4403 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405
4406 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004407 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004410 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 }
4412 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004413 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004417 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418 goto onError;
4419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 Py_XDECREF(errorHandler);
4421 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004422 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 Py_XDECREF(errorHandler);
4426 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 Py_DECREF(unicode);
4428 return NULL;
4429}
4430
4431
Alexander Belopolsky40018472011-02-26 01:02:56 +00004432PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004433_PyUnicode_EncodeUTF7(PyObject *str,
4434 int base64SetO,
4435 int base64WhiteSpace,
4436 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004438 int kind;
4439 void *data;
4440 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004441 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004442 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004444 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 unsigned int base64bits = 0;
4446 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447 char * out;
4448 char * start;
4449
Benjamin Petersonbac79492012-01-14 13:34:47 -05004450 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004451 return NULL;
4452 kind = PyUnicode_KIND(str);
4453 data = PyUnicode_DATA(str);
4454 len = PyUnicode_GET_LENGTH(str);
4455
4456 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004459 /* It might be possible to tighten this worst case */
4460 allocated = 8 * len;
4461 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004462 return PyErr_NoMemory();
4463
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 if (v == NULL)
4466 return NULL;
4467
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004468 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004469 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004470 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 if (inShift) {
4473 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4474 /* shifting out */
4475 if (base64bits) { /* output remaining bits */
4476 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4477 base64buffer = 0;
4478 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479 }
4480 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004481 /* Characters not in the BASE64 set implicitly unshift the sequence
4482 so no '-' is required, except if the character is itself a '-' */
4483 if (IS_BASE64(ch) || ch == '-') {
4484 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 *out++ = (char) ch;
4487 }
4488 else {
4489 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004490 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004492 else { /* not in a shift sequence */
4493 if (ch == '+') {
4494 *out++ = '+';
4495 *out++ = '-';
4496 }
4497 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4498 *out++ = (char) ch;
4499 }
4500 else {
4501 *out++ = '+';
4502 inShift = 1;
4503 goto encode_char;
4504 }
4505 }
4506 continue;
4507encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004509 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004510
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 /* code first surrogate */
4512 base64bits += 16;
4513 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4514 while (base64bits >= 6) {
4515 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4516 base64bits -= 6;
4517 }
4518 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004519 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 base64bits += 16;
4522 base64buffer = (base64buffer << 16) | ch;
4523 while (base64bits >= 6) {
4524 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4525 base64bits -= 6;
4526 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004527 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528 if (base64bits)
4529 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4530 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004532 if (_PyBytes_Resize(&v, out - start) < 0)
4533 return NULL;
4534 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004536PyObject *
4537PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4538 Py_ssize_t size,
4539 int base64SetO,
4540 int base64WhiteSpace,
4541 const char *errors)
4542{
4543 PyObject *result;
4544 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4545 if (tmp == NULL)
4546 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004547 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004548 base64WhiteSpace, errors);
4549 Py_DECREF(tmp);
4550 return result;
4551}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004552
/* The helper macros used by the UTF-7 codec above are no longer needed
   past this point; undefine them. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559/* --- UTF-8 Codec -------------------------------------------------------- */
4560
/* Map a UTF-8 encoded prefix (lead) byte to the length in bytes of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong lead bytes C0-C1 and F5-FF.  See RFC 3629 for
   details.  The table is read-only, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4582
Alexander Belopolsky40018472011-02-26 01:02:56 +00004583PyObject *
4584PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004585 Py_ssize_t size,
4586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587{
Walter Dörwald69652032004-09-07 20:24:22 +00004588 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4589}
4590
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004591#include "stringlib/ucs1lib.h"
4592#include "stringlib/codecs.h"
4593#include "stringlib/undef.h"
4594
4595#include "stringlib/ucs2lib.h"
4596#include "stringlib/codecs.h"
4597#include "stringlib/undef.h"
4598
4599#include "stringlib/ucs4lib.h"
4600#include "stringlib/codecs.h"
4601#include "stringlib/undef.h"
4602
Antoine Pitrouab868312009-01-10 15:40:25 +00004603/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4604#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4605
4606/* Mask to quickly check whether a C 'long' contains a
4607 non-ASCII, UTF8-encoded char. */
4608#if (SIZEOF_LONG == 8)
4609# define ASCII_CHAR_MASK 0x8080808080808080L
4610#elif (SIZEOF_LONG == 4)
4611# define ASCII_CHAR_MASK 0x80808080L
4612#else
4613# error C 'long' size should be either 4 or 8!
4614#endif
4615
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004616/* Scans a UTF-8 string and returns the maximum character to be expected
4617 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004618
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004619 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004620 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004621 */
4622static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004623utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004625 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 const unsigned char *end = p + string_size;
4627 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004629 assert(unicode_size != NULL);
4630
4631 /* By having a cascade of independent loops which fallback onto each
4632 other, we minimize the amount of work done in the average loop
4633 iteration, and we also maximize the CPU's ability to predict
4634 branches correctly (because a given condition will have always the
4635 same boolean outcome except perhaps in the last iteration of the
4636 corresponding loop).
4637 In the general case this brings us rather close to decoding
4638 performance pre-PEP 393, despite the two-pass decoding.
4639
4640 Note that the pure ASCII loop is not duplicated once a non-ASCII
4641 character has been encountered. It is actually a pessimization (by
4642 a significant factor) to use this loop on text with many non-ASCII
4643 characters, and it is important to avoid bad performance on valid
4644 utf-8 data (invalid utf-8 being a different can of worms).
4645 */
4646
4647 /* ASCII */
4648 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004649 /* Only check value if it's not a ASCII char... */
4650 if (*p < 0x80) {
4651 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4652 an explanation. */
4653 if (!((size_t) p & LONG_PTR_MASK)) {
4654 /* Help register allocation */
4655 register const unsigned char *_p = p;
4656 while (_p < aligned_end) {
4657 unsigned long value = *(unsigned long *) _p;
4658 if (value & ASCII_CHAR_MASK)
4659 break;
4660 _p += SIZEOF_LONG;
4661 char_count += SIZEOF_LONG;
4662 }
4663 p = _p;
4664 if (p == end)
4665 break;
4666 }
4667 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004668 if (*p < 0x80)
4669 ++char_count;
4670 else
4671 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004672 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004673 *unicode_size = char_count;
4674 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004675
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004676_ucs1loop:
4677 for (; p < end; ++p) {
4678 if (*p < 0xc4)
4679 char_count += ((*p & 0xc0) != 0x80);
4680 else
4681 goto _ucs2loop;
4682 }
4683 *unicode_size = char_count;
4684 return 255;
4685
4686_ucs2loop:
4687 for (; p < end; ++p) {
4688 if (*p < 0xf0)
4689 char_count += ((*p & 0xc0) != 0x80);
4690 else
4691 goto _ucs4loop;
4692 }
4693 *unicode_size = char_count;
4694 return 65535;
4695
4696_ucs4loop:
4697 for (; p < end; ++p) {
4698 char_count += ((*p & 0xc0) != 0x80);
4699 }
4700 *unicode_size = char_count;
4701 return 65537;
4702}
4703
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors: stores `value` at position `index` of `unicode`
   through unicode_putchar(), first growing the string with
   unicode_resize() when `index` lies beyond its current length.
   Implicit parameters: unicode (an lvalue, may be replaced) and the
   onError label, which is jumped to on failure.
   Potential resizing overallocates (to index + index/8), so the result
   needs to shrink at the end.
   `index` and `value` are each evaluated exactly once; the local is
   given a macro-specific name so that it cannot capture an identifier
   used in the caller's argument expressions.
   NOTE(review): the guard is `>` rather than `>=`, so a write at
   index == length does not trigger a resize -- confirm this is intended. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t wmf_pos = (index);                                   \
        if (wmf_pos > PyUnicode_GET_LENGTH(unicode) &&                  \
            unicode_resize(&unicode, wmf_pos + wmf_pos/8) < 0)          \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &wmf_pos, (value)) < 0)           \
            goto onError;                                               \
    } while (0)
4717
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004718static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004719decode_utf8_errors(const char *starts,
4720 Py_ssize_t size,
4721 const char *errors,
4722 Py_ssize_t *consumed,
4723 const char *s,
4724 PyObject *unicode,
4725 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004726{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004728 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004729 Py_ssize_t startinpos;
4730 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004731 const char *e = starts + size;
4732 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004733 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 PyObject *errorHandler = NULL;
4735 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004736
Antoine Pitrouab868312009-01-10 15:40:25 +00004737 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738
4739 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004740 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741
4742 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004743 /* Fast path for runs of ASCII characters. Given that common UTF-8
4744 input will consist of an overwhelming majority of ASCII
4745 characters, we try to optimize for this case by checking
4746 as many characters as a C 'long' can contain.
4747 First, check if we can do an aligned read, as most CPUs have
4748 a penalty for unaligned reads.
4749 */
4750 if (!((size_t) s & LONG_PTR_MASK)) {
4751 /* Help register allocation */
4752 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004754 while (_s < aligned_end) {
4755 /* Read a whole long at a time (either 4 or 8 bytes),
4756 and do a fast unrolled copy if it only contains ASCII
4757 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 unsigned long value = *(unsigned long *) _s;
4759 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004760 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004761 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4762 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4763 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4764 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004765#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004766 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4767 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4768 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4769 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004770#endif
4771 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004773 }
4774 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004776 if (s == e)
4777 break;
4778 ch = (unsigned char)*s;
4779 }
4780 }
4781
4782 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004783 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 s++;
4785 continue;
4786 }
4787
4788 n = utf8_code_length[ch];
4789
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004790 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 if (consumed)
4792 break;
4793 else {
4794 errmsg = "unexpected end of data";
4795 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004796 endinpos = startinpos+1;
4797 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4798 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 goto utf8Error;
4800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802
4803 switch (n) {
4804
4805 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004806 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 startinpos = s-starts;
4808 endinpos = startinpos+1;
4809 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
4811 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004812 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 startinpos = s-starts;
4814 endinpos = startinpos+1;
4815 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816
4817 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004818 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004819 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004821 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 goto utf8Error;
4823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004825 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004826 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 break;
4828
4829 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004830 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4831 will result in surrogates in range d800-dfff. Surrogates are
4832 not valid UTF-8 so they are rejected.
4833 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4834 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004835 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004836 (s[2] & 0xc0) != 0x80 ||
4837 ((unsigned char)s[0] == 0xE0 &&
4838 (unsigned char)s[1] < 0xA0) ||
4839 ((unsigned char)s[0] == 0xED &&
4840 (unsigned char)s[1] > 0x9F)) {
4841 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004843 endinpos = startinpos + 1;
4844
4845 /* if s[1] first two bits are 1 and 0, then the invalid
4846 continuation byte is s[2], so increment endinpos by 1,
4847 if not, s[1] is invalid and endinpos doesn't need to
4848 be incremented. */
4849 if ((s[1] & 0xC0) == 0x80)
4850 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 goto utf8Error;
4852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004854 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004855 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004856 break;
4857
4858 case 4:
4859 if ((s[1] & 0xc0) != 0x80 ||
4860 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004861 (s[3] & 0xc0) != 0x80 ||
4862 ((unsigned char)s[0] == 0xF0 &&
4863 (unsigned char)s[1] < 0x90) ||
4864 ((unsigned char)s[0] == 0xF4 &&
4865 (unsigned char)s[1] > 0x8F)) {
4866 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004868 endinpos = startinpos + 1;
4869 if ((s[1] & 0xC0) == 0x80) {
4870 endinpos++;
4871 if ((s[2] & 0xC0) == 0x80)
4872 endinpos++;
4873 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 goto utf8Error;
4875 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004876 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004877 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004878 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004879
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004880 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 }
4883 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004885
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 if (unicode_decode_call_errorhandler(
4888 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004889 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004891 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893 /* Update data because unicode_decode_call_errorhandler might have
4894 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 }
Walter Dörwald69652032004-09-07 20:24:22 +00004897 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004900 /* Adjust length and ready string when it contained errors and
4901 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004902 if (unicode_resize(&unicode, i) < 0)
4903 goto onError;
4904 unicode_adjust_maxchar(&unicode);
4905 if (unicode == NULL)
4906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 Py_XDECREF(errorHandler);
4909 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004910 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004911 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 Py_XDECREF(errorHandler);
4915 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004916 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 return NULL;
4918}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004919#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004920
Victor Stinner785938e2011-12-11 20:09:03 +01004921PyObject *
4922PyUnicode_DecodeUTF8Stateful(const char *s,
4923 Py_ssize_t size,
4924 const char *errors,
4925 Py_ssize_t *consumed)
4926{
4927 Py_UCS4 maxchar = 0;
4928 Py_ssize_t unicode_size;
4929 int has_errors = 0;
4930 PyObject *unicode;
4931 int kind;
4932 void *data;
4933 const char *starts = s;
4934 const char *e;
4935 Py_ssize_t i;
4936
4937 if (size == 0) {
4938 if (consumed)
4939 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004940 Py_INCREF(unicode_empty);
4941 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004942 }
4943
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004944 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004945
4946 /* When the string is ASCII only, just use memcpy and return.
4947 unicode_size may be != size if there is an incomplete UTF-8
4948 sequence at the end of the ASCII block. */
4949 if (maxchar < 128 && size == unicode_size) {
4950 if (consumed)
4951 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004952 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004953 }
4954
4955 unicode = PyUnicode_New(unicode_size, maxchar);
4956 if (!unicode)
4957 return NULL;
4958 kind = PyUnicode_KIND(unicode);
4959 data = PyUnicode_DATA(unicode);
4960
4961 /* Unpack UTF-8 encoded data */
4962 i = 0;
4963 e = starts + size;
4964 switch (kind) {
4965 case PyUnicode_1BYTE_KIND:
4966 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4967 break;
4968 case PyUnicode_2BYTE_KIND:
4969 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4970 break;
4971 case PyUnicode_4BYTE_KIND:
4972 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4973 break;
4974 }
4975 if (!has_errors) {
4976 /* Ensure the unicode size calculation was correct */
4977 assert(i == unicode_size);
4978 assert(s == e);
4979 if (consumed)
4980 *consumed = size;
4981 return unicode;
4982 }
4983
4984 /* In case of errors, maxchar and size computation might be incorrect;
4985 code below refits and resizes as necessary. */
4986 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4987}
4988
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly allocated, NUL-terminated
   wchar_t string obtained from PyMem_Malloc().  Each byte that does not
   start a valid sequence is emitted as 0xDC00 + byte and decoding
   resumes at the following byte.  Returns NULL if the buffer cannot be
   allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        Py_UCS4 ch = (unsigned char)*s;
        int decoded = 0;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* A sequence running past the end of the input is never decoded;
           neither is one with an invalid lead byte (length 0 or 1). */
        if (s + seqlen <= end) {
            switch (seqlen) {
            case 2:
                if ((s[1] & 0xc0) == 0x80) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                    *out++ = (wchar_t)ch;
                    decoded = 1;
                }
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xE0 &&
                      (unsigned char)s[1] < 0xA0) &&
                    !((unsigned char)s[0] == 0xED &&
                      (unsigned char)s[1] > 0x9F)) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                    *out++ = (wchar_t)ch;
                    decoded = 1;
                }
                break;

            case 4:
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xF0 &&
                      (unsigned char)s[1] < 0x90) &&
                    !((unsigned char)s[0] == 0xF4 &&
                      (unsigned char)s[1] > 0x8F)) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)ch;
#else
                    /* compute and append the two surrogates: */
                    *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
                    *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
                    decoded = 1;
                }
                break;

            default:
                /* 0 or 1: invalid lead byte */
                break;
            }
        }

        if (!decoded) {
            /* surrogateescape: `ch` still holds the offending lead byte */
            *out++ = 0xDC00 + ch;
            s++;
            continue;
        }
        s += seqlen;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005097/* Primary internal function which creates utf8 encoded bytes objects.
5098
5099 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005100 and allocate exactly as much space needed at the end. Else allocate the
5101 maximum possible needed (4 result bytes per Unicode character), and return
5102 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005103*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005104PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005105_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106{
Victor Stinner6099a032011-12-18 14:22:26 +01005107 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 void *data;
5109 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111 if (!PyUnicode_Check(unicode)) {
5112 PyErr_BadArgument();
5113 return NULL;
5114 }
5115
5116 if (PyUnicode_READY(unicode) == -1)
5117 return NULL;
5118
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005119 if (PyUnicode_UTF8(unicode))
5120 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5121 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005122
5123 kind = PyUnicode_KIND(unicode);
5124 data = PyUnicode_DATA(unicode);
5125 size = PyUnicode_GET_LENGTH(unicode);
5126
Benjamin Petersonead6b532011-12-20 17:23:42 -06005127 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005128 default:
5129 assert(0);
5130 case PyUnicode_1BYTE_KIND:
5131 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5132 assert(!PyUnicode_IS_ASCII(unicode));
5133 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5134 case PyUnicode_2BYTE_KIND:
5135 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5136 case PyUnicode_4BYTE_KIND:
5137 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139}
5140
Alexander Belopolsky40018472011-02-26 01:02:56 +00005141PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5143 Py_ssize_t size,
5144 const char *errors)
5145{
5146 PyObject *v, *unicode;
5147
5148 unicode = PyUnicode_FromUnicode(s, size);
5149 if (unicode == NULL)
5150 return NULL;
5151 v = _PyUnicode_AsUTF8String(unicode, errors);
5152 Py_DECREF(unicode);
5153 return v;
5154}
5155
5156PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005157PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005159 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160}
5161
/* --- UTF-32 Codec ------------------------------------------------------- */
5163
5164PyObject *
5165PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 Py_ssize_t size,
5167 const char *errors,
5168 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005169{
5170 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5171}
5172
5173PyObject *
5174PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 Py_ssize_t size,
5176 const char *errors,
5177 int *byteorder,
5178 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179{
5180 const char *starts = s;
5181 Py_ssize_t startinpos;
5182 Py_ssize_t endinpos;
5183 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005184 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005185 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005186 int bo = 0; /* assume native ordering by default */
5187 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005188 /* Offsets from q for retrieving bytes in the right order. */
5189#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5190 int iorder[] = {0, 1, 2, 3};
5191#else
5192 int iorder[] = {3, 2, 1, 0};
5193#endif
5194 PyObject *errorHandler = NULL;
5195 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005196
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197 q = (unsigned char *)s;
5198 e = q + size;
5199
5200 if (byteorder)
5201 bo = *byteorder;
5202
5203 /* Check for BOM marks (U+FEFF) in the input and adjust current
5204 byte order setting accordingly. In native mode, the leading BOM
5205 mark is skipped, in all other modes, it is copied to the output
5206 stream as-is (giving a ZWNBSP character). */
5207 if (bo == 0) {
5208 if (size >= 4) {
5209 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 if (bom == 0x0000FEFF) {
5213 q += 4;
5214 bo = -1;
5215 }
5216 else if (bom == 0xFFFE0000) {
5217 q += 4;
5218 bo = 1;
5219 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005220#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 if (bom == 0x0000FEFF) {
5222 q += 4;
5223 bo = 1;
5224 }
5225 else if (bom == 0xFFFE0000) {
5226 q += 4;
5227 bo = -1;
5228 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005229#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005231 }
5232
5233 if (bo == -1) {
5234 /* force LE */
5235 iorder[0] = 0;
5236 iorder[1] = 1;
5237 iorder[2] = 2;
5238 iorder[3] = 3;
5239 }
5240 else if (bo == 1) {
5241 /* force BE */
5242 iorder[0] = 3;
5243 iorder[1] = 2;
5244 iorder[2] = 1;
5245 iorder[3] = 0;
5246 }
5247
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005248 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005249 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005250 if (!unicode)
5251 return NULL;
5252 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005253 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005254 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005255
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 Py_UCS4 ch;
5258 /* remaining bytes at the end? (size should be divisible by 4) */
5259 if (e-q<4) {
5260 if (consumed)
5261 break;
5262 errmsg = "truncated data";
5263 startinpos = ((const char *)q)-starts;
5264 endinpos = ((const char *)e)-starts;
5265 goto utf32Error;
5266 /* The remaining input chars are ignored if the callback
5267 chooses to skip the input */
5268 }
5269 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5270 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 if (ch >= 0x110000)
5273 {
5274 errmsg = "codepoint not in range(0x110000)";
5275 startinpos = ((const char *)q)-starts;
5276 endinpos = startinpos+4;
5277 goto utf32Error;
5278 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005279 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5280 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 q += 4;
5282 continue;
5283 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 if (unicode_decode_call_errorhandler(
5285 errors, &errorHandler,
5286 "utf32", errmsg,
5287 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005288 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290 }
5291
5292 if (byteorder)
5293 *byteorder = bo;
5294
5295 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005297
5298 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005299 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005300 goto onError;
5301
5302 Py_XDECREF(errorHandler);
5303 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005304 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005307 Py_DECREF(unicode);
5308 Py_XDECREF(errorHandler);
5309 Py_XDECREF(exc);
5310 return NULL;
5311}
5312
5313PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005314_PyUnicode_EncodeUTF32(PyObject *str,
5315 const char *errors,
5316 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005318 int kind;
5319 void *data;
5320 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005321 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005322 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005323 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005324 /* Offsets from p for storing byte pairs in the right order. */
5325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5326 int iorder[] = {0, 1, 2, 3};
5327#else
5328 int iorder[] = {3, 2, 1, 0};
5329#endif
5330
Benjamin Peterson29060642009-01-31 22:14:21 +00005331#define STORECHAR(CH) \
5332 do { \
5333 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5334 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5335 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5336 p[iorder[0]] = (CH) & 0xff; \
5337 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005338 } while(0)
5339
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005340 if (!PyUnicode_Check(str)) {
5341 PyErr_BadArgument();
5342 return NULL;
5343 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005344 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005345 return NULL;
5346 kind = PyUnicode_KIND(str);
5347 data = PyUnicode_DATA(str);
5348 len = PyUnicode_GET_LENGTH(str);
5349
5350 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005351 bytesize = nsize * 4;
5352 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005354 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355 if (v == NULL)
5356 return NULL;
5357
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005358 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005361 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005362 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363
5364 if (byteorder == -1) {
5365 /* force LE */
5366 iorder[0] = 0;
5367 iorder[1] = 1;
5368 iorder[2] = 2;
5369 iorder[3] = 3;
5370 }
5371 else if (byteorder == 1) {
5372 /* force BE */
5373 iorder[0] = 3;
5374 iorder[1] = 2;
5375 iorder[2] = 1;
5376 iorder[3] = 0;
5377 }
5378
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 for (i = 0; i < len; i++)
5380 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005381
5382 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005383 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384#undef STORECHAR
5385}
5386
Alexander Belopolsky40018472011-02-26 01:02:56 +00005387PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005388PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5389 Py_ssize_t size,
5390 const char *errors,
5391 int byteorder)
5392{
5393 PyObject *result;
5394 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5395 if (tmp == NULL)
5396 return NULL;
5397 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5398 Py_DECREF(tmp);
5399 return result;
5400}
5401
5402PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005403PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404{
Victor Stinnerb960b342011-11-20 19:12:52 +01005405 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005406}
5407
/* --- UTF-16 Codec ------------------------------------------------------- */
5409
Tim Peters772747b2001-08-09 22:21:55 +00005410PyObject *
5411PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 Py_ssize_t size,
5413 const char *errors,
5414 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415{
Walter Dörwald69652032004-09-07 20:24:22 +00005416 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5417}
5418
Antoine Pitrouab868312009-01-10 15:40:25 +00005419/* Two masks for fast checking of whether a C 'long' may contain
5420 UTF16-encoded surrogate characters. This is an efficient heuristic,
5421 assuming that non-surrogate characters with a code point >= 0x8000 are
5422 rare in most input.
5423 FAST_CHAR_MASK is used when the input is in native byte ordering,
5424 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005425*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005426#if (SIZEOF_LONG == 8)
5427# define FAST_CHAR_MASK 0x8000800080008000L
5428# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005429# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005430#elif (SIZEOF_LONG == 4)
5431# define FAST_CHAR_MASK 0x80008000L
5432# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005433# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005434#else
5435# error C 'long' size should be either 4 or 8!
5436#endif
5437
Walter Dörwald69652032004-09-07 20:24:22 +00005438PyObject *
5439PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 Py_ssize_t size,
5441 const char *errors,
5442 int *byteorder,
5443 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005444{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 Py_ssize_t startinpos;
5447 Py_ssize_t endinpos;
5448 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005449 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005450 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005451 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005452 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005453 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005454 /* Offsets from q for retrieving byte pairs in the right order. */
5455#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5456 int ihi = 1, ilo = 0;
5457#else
5458 int ihi = 0, ilo = 1;
5459#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 PyObject *errorHandler = NULL;
5461 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
5463 /* Note: size will always be longer than the resulting Unicode
5464 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005465 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 if (!unicode)
5467 return NULL;
5468 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005469 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005470 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
Tim Peters772747b2001-08-09 22:21:55 +00005472 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005473 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
5475 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005476 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005478 /* Check for BOM marks (U+FEFF) in the input and adjust current
5479 byte order setting accordingly. In native mode, the leading BOM
5480 mark is skipped, in all other modes, it is copied to the output
5481 stream as-is (giving a ZWNBSP character). */
5482 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005483 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005484 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005485#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 if (bom == 0xFEFF) {
5487 q += 2;
5488 bo = -1;
5489 }
5490 else if (bom == 0xFFFE) {
5491 q += 2;
5492 bo = 1;
5493 }
Tim Petersced69f82003-09-16 20:30:58 +00005494#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 if (bom == 0xFEFF) {
5496 q += 2;
5497 bo = 1;
5498 }
5499 else if (bom == 0xFFFE) {
5500 q += 2;
5501 bo = -1;
5502 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005503#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506
Tim Peters772747b2001-08-09 22:21:55 +00005507 if (bo == -1) {
5508 /* force LE */
5509 ihi = 1;
5510 ilo = 0;
5511 }
5512 else if (bo == 1) {
5513 /* force BE */
5514 ihi = 0;
5515 ilo = 1;
5516 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5518 native_ordering = ilo < ihi;
5519#else
5520 native_ordering = ilo > ihi;
5521#endif
Tim Peters772747b2001-08-09 22:21:55 +00005522
Antoine Pitrouab868312009-01-10 15:40:25 +00005523 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005524 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005525 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005526 /* First check for possible aligned read of a C 'long'. Unaligned
5527 reads are more expensive, better to defer to another iteration. */
5528 if (!((size_t) q & LONG_PTR_MASK)) {
5529 /* Fast path for runs of non-surrogate chars. */
5530 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005531 int kind = PyUnicode_KIND(unicode);
5532 void *data = PyUnicode_DATA(unicode);
5533 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005534 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005535 Py_UCS4 maxch;
5536 if (native_ordering) {
5537 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005538 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005539 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005540 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005541 else {
5542 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005543 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005544 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005545 block = ((block >> 8) & STRIPPED_MASK) |
5546 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005547 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005548 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005549#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005550 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5551 maxch = Py_MAX(maxch, ch);
5552 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5553 maxch = Py_MAX(maxch, ch);
5554 ch = (Py_UCS2)(block >> 48);
5555 maxch = Py_MAX(maxch, ch);
5556#else
5557 ch = (Py_UCS2)(block >> 16);
5558 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559#endif
5560 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5561 if (unicode_widen(&unicode, maxch) < 0)
5562 goto onError;
5563 kind = PyUnicode_KIND(unicode);
5564 data = PyUnicode_DATA(unicode);
5565 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5567 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005568#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005569 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5570 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5571 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5572#else
5573 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5574#endif
5575#else
5576#if SIZEOF_LONG == 8
5577 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5578 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5579 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5580#else
5581 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5582#endif
5583 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584#endif
5585 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005586 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005587 q = _q;
5588 if (q >= e)
5589 break;
5590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005592
Benjamin Peterson14339b62009-01-31 16:36:08 +00005593 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005594
Victor Stinner551ac952011-11-29 22:58:13 +01005595 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005596 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5597 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 continue;
5599 }
5600
5601 /* UTF-16 code pair: */
5602 if (q > e) {
5603 errmsg = "unexpected end of data";
5604 startinpos = (((const char *)q) - 2) - starts;
5605 endinpos = ((const char *)e) + 1 - starts;
5606 goto utf16Error;
5607 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005608 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5609 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005611 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005612 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005613 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005614 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 continue;
5616 }
5617 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005618 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 startinpos = (((const char *)q)-4)-starts;
5620 endinpos = startinpos+2;
5621 goto utf16Error;
5622 }
5623
Benjamin Peterson14339b62009-01-31 16:36:08 +00005624 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 errmsg = "illegal encoding";
5626 startinpos = (((const char *)q)-2)-starts;
5627 endinpos = startinpos+2;
5628 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005632 errors,
5633 &errorHandler,
5634 "utf16", errmsg,
5635 &starts,
5636 (const char **)&e,
5637 &startinpos,
5638 &endinpos,
5639 &exc,
5640 (const char **)&q,
5641 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005645 /* remaining byte at the end? (size should be even) */
5646 if (e == q) {
5647 if (!consumed) {
5648 errmsg = "truncated data";
5649 startinpos = ((const char *)q) - starts;
5650 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005651 if (unicode_decode_call_errorhandler(
5652 errors,
5653 &errorHandler,
5654 "utf16", errmsg,
5655 &starts,
5656 (const char **)&e,
5657 &startinpos,
5658 &endinpos,
5659 &exc,
5660 (const char **)&q,
5661 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005662 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005663 goto onError;
5664 /* The remaining input chars are ignored if the callback
5665 chooses to skip the input */
5666 }
5667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
5669 if (byteorder)
5670 *byteorder = bo;
5671
Walter Dörwald69652032004-09-07 20:24:22 +00005672 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005676 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 goto onError;
5678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 Py_XDECREF(errorHandler);
5680 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005681 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 Py_XDECREF(errorHandler);
5686 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 return NULL;
5688}
5689
Antoine Pitrouab868312009-01-10 15:40:25 +00005690#undef FAST_CHAR_MASK
5691#undef SWAPPED_FAST_CHAR_MASK
5692
Tim Peters772747b2001-08-09 22:21:55 +00005693PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694_PyUnicode_EncodeUTF16(PyObject *str,
5695 const char *errors,
5696 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005698 int kind;
5699 void *data;
5700 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005701 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005702 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005703 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005704 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005705 /* Offsets from p for storing byte pairs in the right order. */
5706#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5707 int ihi = 1, ilo = 0;
5708#else
5709 int ihi = 0, ilo = 1;
5710#endif
5711
Benjamin Peterson29060642009-01-31 22:14:21 +00005712#define STORECHAR(CH) \
5713 do { \
5714 p[ihi] = ((CH) >> 8) & 0xff; \
5715 p[ilo] = (CH) & 0xff; \
5716 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005717 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 if (!PyUnicode_Check(str)) {
5720 PyErr_BadArgument();
5721 return NULL;
5722 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005723 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 return NULL;
5725 kind = PyUnicode_KIND(str);
5726 data = PyUnicode_DATA(str);
5727 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 pairs = 0;
5730 if (kind == PyUnicode_4BYTE_KIND)
5731 for (i = 0; i < len; i++)
5732 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5733 pairs++;
5734 /* 2 * (len + pairs + (byteorder == 0)) */
5735 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005738 bytesize = nsize * 2;
5739 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005741 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 if (v == NULL)
5743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005745 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005749 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005750
5751 if (byteorder == -1) {
5752 /* force LE */
5753 ihi = 1;
5754 ilo = 0;
5755 }
5756 else if (byteorder == 1) {
5757 /* force BE */
5758 ihi = 0;
5759 ilo = 1;
5760 }
5761
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 for (i = 0; i < len; i++) {
5763 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5764 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005766 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5767 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 }
Tim Peters772747b2001-08-09 22:21:55 +00005769 STORECHAR(ch);
5770 if (ch2)
5771 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773
5774 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005775 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005776#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777}
5778
Alexander Belopolsky40018472011-02-26 01:02:56 +00005779PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005780PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5781 Py_ssize_t size,
5782 const char *errors,
5783 int byteorder)
5784{
5785 PyObject *result;
5786 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5787 if (tmp == NULL)
5788 return NULL;
5789 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5790 Py_DECREF(tmp);
5791 return result;
5792}
5793
5794PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005795PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005797 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798}
5799
/* --- Unicode Escape Codec ----------------------------------------------- */
5801
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input still decodes to a pure ASCII string.

   s    -- start of the escaped input
   size -- number of bytes at s

   Returns the number of characters the decoded string will contain, or
   -1 when an all-ASCII result cannot be guaranteed: size is negative,
   a byte above 127 occurs, a backslash is the last byte, or one of the
   escapes that may yield a non-ASCII character is used (octal, \x, \u,
   \U, \N).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: the backslash and the other character
                   are both kept */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5856
Fredrik Lundh06d12682001-01-24 07:59:11 +00005857static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005858
Alexander Belopolsky40018472011-02-26 01:02:56 +00005859PyObject *
5860PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005861 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005862 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865 Py_ssize_t startinpos;
5866 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005867 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005868 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005870 char* message;
5871 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 PyObject *errorHandler = NULL;
5873 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005874 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005875 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005876
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005877 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005878
5879 /* After length_of_escaped_ascii_string() there are two alternatives,
5880 either the string is pure ASCII with named escapes like \n, etc.
5881 and we determined it's exact size (common case)
5882 or it contains \x, \u, ... escape sequences. then we create a
5883 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005884 if (len >= 0) {
5885 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005886 if (!v)
5887 goto onError;
5888 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005889 }
5890 else {
5891 /* Escaped strings will always be longer than the resulting
5892 Unicode string, so we start with size here and then reduce the
5893 length after conversion to the true value.
5894 (but if the error callback returns a long replacement string
5895 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005896 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005897 if (!v)
5898 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005899 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900 }
5901
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005903 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005904 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005906
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 while (s < end) {
5908 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005909 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005912 /* The only case in which i == ascii_length is a backslash
5913 followed by a newline. */
5914 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 /* Non-escape characters are interpreted as Unicode ordinals */
5917 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005918 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5919 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 continue;
5921 }
5922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 /* \ - Escapes */
5925 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005926 c = *s++;
5927 if (s > end)
5928 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005929
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 /* The only case in which i == ascii_length is a backslash
5931 followed by a newline. */
5932 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005933
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005934 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005937#define WRITECHAR(ch) \
5938 do { \
5939 if (unicode_putchar(&v, &i, ch) < 0) \
5940 goto onError; \
5941 }while(0)
5942
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005944 case '\\': WRITECHAR('\\'); break;
5945 case '\'': WRITECHAR('\''); break;
5946 case '\"': WRITECHAR('\"'); break;
5947 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005949 case 'f': WRITECHAR('\014'); break;
5950 case 't': WRITECHAR('\t'); break;
5951 case 'n': WRITECHAR('\n'); break;
5952 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005954 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005955 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005956 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 case '0': case '1': case '2': case '3':
5960 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005961 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005962 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005963 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005964 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005965 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005967 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 break;
5969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* hex escapes */
5971 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005973 digits = 2;
5974 message = "truncated \\xXX escape";
5975 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005979 digits = 4;
5980 message = "truncated \\uXXXX escape";
5981 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005984 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005985 digits = 8;
5986 message = "truncated \\UXXXXXXXX escape";
5987 hexescape:
5988 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 if (s+digits>end) {
5990 endinpos = size;
5991 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 errors, &errorHandler,
5993 "unicodeescape", "end of string in escape sequence",
5994 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005995 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 goto onError;
5997 goto nextByte;
5998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005999 for (j = 0; j < digits; ++j) {
6000 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006001 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006002 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 errors, &errorHandler,
6005 "unicodeescape", message,
6006 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006008 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006009 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006011 }
6012 chr = (chr<<4) & ~0xF;
6013 if (c >= '0' && c <= '9')
6014 chr += c - '0';
6015 else if (c >= 'a' && c <= 'f')
6016 chr += 10 + c - 'a';
6017 else
6018 chr += 10 + c - 'A';
6019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006020 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006021 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 /* _decoding_error will have already written into the
6023 target buffer. */
6024 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006025 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006026 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006027 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006028 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006029 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 errors, &errorHandler,
6033 "unicodeescape", "illegal Unicode character",
6034 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006035 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006036 goto onError;
6037 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006038 break;
6039
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 case 'N':
6042 message = "malformed \\N character escape";
6043 if (ucnhash_CAPI == NULL) {
6044 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006045 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6046 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047 if (ucnhash_CAPI == NULL)
6048 goto ucnhashError;
6049 }
6050 if (*s == '{') {
6051 const char *start = s+1;
6052 /* look for the closing brace */
6053 while (*s != '}' && s < end)
6054 s++;
6055 if (s > start && s < end && *s == '}') {
6056 /* found a name. look it up in the unicode database */
6057 message = "unknown Unicode character name";
6058 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006060 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006061 goto store;
6062 }
6063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 errors, &errorHandler,
6067 "unicodeescape", message,
6068 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006069 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006070 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 break;
6072
6073 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006074 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 message = "\\ at end of string";
6076 s--;
6077 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 errors, &errorHandler,
6080 "unicodeescape", message,
6081 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006082 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006083 goto onError;
6084 }
6085 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006086 WRITECHAR('\\');
6087 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006088 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006089 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006094#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006095
Victor Stinner16e6a802011-12-12 13:24:15 +01006096 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006097 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006098 Py_XDECREF(errorHandler);
6099 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006100 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006103 PyErr_SetString(
6104 PyExc_UnicodeError,
6105 "\\N escapes not supported (can't load unicodedata module)"
6106 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006107 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 Py_XDECREF(errorHandler);
6109 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006110 return NULL;
6111
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 Py_XDECREF(errorHandler);
6115 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 return NULL;
6117}
6118
6119/* Return a Unicode-Escape string version of the Unicode object.
6120
6121 If quotes is true, the string is enclosed in u"" or u'' quotes as
6122 appropriate.
6123
6124*/
6125
Alexander Belopolsky40018472011-02-26 01:02:56 +00006126PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006130 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132 int kind;
6133 void *data;
6134 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Thomas Wouters89f507f2006-12-13 04:49:30 +00006136 /* Initial allocation is based on the longest-possible unichr
6137 escape.
6138
6139 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6140 unichr, so in this case it's the longest unichr escape. In
6141 narrow (UTF-16) builds this is five chars per source unichr
6142 since there are two unichrs in the surrogate pair, so in narrow
6143 (UTF-16) builds it's not the longest unichr escape.
6144
6145 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6146 so in the narrow (UTF-16) build case it's the longest unichr
6147 escape.
6148 */
6149
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150 if (!PyUnicode_Check(unicode)) {
6151 PyErr_BadArgument();
6152 return NULL;
6153 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006154 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 return NULL;
6156 len = PyUnicode_GET_LENGTH(unicode);
6157 kind = PyUnicode_KIND(unicode);
6158 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006159 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6161 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6162 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6163 }
6164
6165 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006166 return PyBytes_FromStringAndSize(NULL, 0);
6167
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006170
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006171 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 if (repr == NULL)
6176 return NULL;
6177
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006178 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006181 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006182
Walter Dörwald79e913e2007-05-12 11:08:06 +00006183 /* Escape backslashes */
6184 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 *p++ = '\\';
6186 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006187 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006188 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006189
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006190 /* Map 21-bit characters to '\U00xxxxxx' */
6191 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006192 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006193 *p++ = '\\';
6194 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006195 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6196 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6197 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6198 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6199 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6200 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6202 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006204 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006205
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006207 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 *p++ = '\\';
6209 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006210 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6211 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6212 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6213 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006215
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006216 /* Map special whitespace to '\t', \n', '\r' */
6217 else if (ch == '\t') {
6218 *p++ = '\\';
6219 *p++ = 't';
6220 }
6221 else if (ch == '\n') {
6222 *p++ = '\\';
6223 *p++ = 'n';
6224 }
6225 else if (ch == '\r') {
6226 *p++ = '\\';
6227 *p++ = 'r';
6228 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006229
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006230 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006231 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006233 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006234 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6235 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006236 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006237
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 /* Copy everything else as-is */
6239 else
6240 *p++ = (char) ch;
6241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006243 assert(p - PyBytes_AS_STRING(repr) > 0);
6244 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6245 return NULL;
6246 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247}
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006250PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6251 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006253 PyObject *result;
6254 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6255 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006257 result = PyUnicode_AsUnicodeEscapeString(tmp);
6258 Py_DECREF(tmp);
6259 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260}
6261
6262/* --- Raw Unicode Escape Codec ------------------------------------------- */
6263
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264PyObject *
6265PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006266 Py_ssize_t size,
6267 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006269 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006270 Py_ssize_t startinpos;
6271 Py_ssize_t endinpos;
6272 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006273 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 const char *end;
6275 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006276 PyObject *errorHandler = NULL;
6277 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006278
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 /* Escaped strings will always be longer than the resulting
6280 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 length after conversion to the true value. (But decoding error
6282 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006283 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006287 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006288 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 end = s + size;
6290 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 unsigned char c;
6292 Py_UCS4 x;
6293 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006294 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 /* Non-escape characters are interpreted as Unicode ordinals */
6297 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006298 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6299 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 startinpos = s-starts;
6303
6304 /* \u-escapes are only interpreted iff the number of leading
6305 backslashes if odd */
6306 bs = s;
6307 for (;s < end;) {
6308 if (*s != '\\')
6309 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006310 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6311 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 }
6313 if (((s - bs) & 1) == 0 ||
6314 s >= end ||
6315 (*s != 'u' && *s != 'U')) {
6316 continue;
6317 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006318 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 count = *s=='u' ? 4 : 8;
6320 s++;
6321
6322 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 for (x = 0, i = 0; i < count; ++i, ++s) {
6324 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006325 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 endinpos = s-starts;
6327 if (unicode_decode_call_errorhandler(
6328 errors, &errorHandler,
6329 "rawunicodeescape", "truncated \\uXXXX",
6330 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006331 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 goto onError;
6333 goto nextByte;
6334 }
6335 x = (x<<4) & ~0xF;
6336 if (c >= '0' && c <= '9')
6337 x += c - '0';
6338 else if (c >= 'a' && c <= 'f')
6339 x += 10 + c - 'a';
6340 else
6341 x += 10 + c - 'A';
6342 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006343 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006344 if (unicode_putchar(&v, &outpos, x) < 0)
6345 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006346 } else {
6347 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006348 if (unicode_decode_call_errorhandler(
6349 errors, &errorHandler,
6350 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006352 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006354 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 nextByte:
6356 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006358 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 Py_XDECREF(errorHandler);
6361 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006362 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366 Py_XDECREF(errorHandler);
6367 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 return NULL;
6369}
6370
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006371
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006373PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006375 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 char *p;
6377 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006378 Py_ssize_t expandsize, pos;
6379 int kind;
6380 void *data;
6381 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006383 if (!PyUnicode_Check(unicode)) {
6384 PyErr_BadArgument();
6385 return NULL;
6386 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388 return NULL;
6389 kind = PyUnicode_KIND(unicode);
6390 data = PyUnicode_DATA(unicode);
6391 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006392 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6393 bytes, and 1 byte characters 4. */
6394 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006398
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006399 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 if (repr == NULL)
6401 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006403 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006405 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 for (pos = 0; pos < len; pos++) {
6407 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 /* Map 32-bit characters to '\Uxxxxxxxx' */
6409 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006410 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006411 *p++ = '\\';
6412 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006413 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6414 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6415 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6416 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6417 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6418 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6419 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6420 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006421 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 *p++ = '\\';
6425 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006426 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6427 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6428 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6429 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 /* Copy everything else as-is */
6432 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 *p++ = (char) ch;
6434 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006435
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006436 assert(p > q);
6437 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006438 return NULL;
6439 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006443PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 PyObject *result;
6447 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6448 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006449 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6451 Py_DECREF(tmp);
6452 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453}
6454
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006455/* --- Unicode Internal Codec ------------------------------------------- */
6456
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
6458_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006459 Py_ssize_t size,
6460 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006461{
6462 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006463 Py_ssize_t startinpos;
6464 Py_ssize_t endinpos;
6465 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006466 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467 const char *end;
6468 const char *reason;
6469 PyObject *errorHandler = NULL;
6470 PyObject *exc = NULL;
6471
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006472 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006473 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006474 1))
6475 return NULL;
6476
Thomas Wouters89f507f2006-12-13 04:49:30 +00006477 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006478 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006479 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006481 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006482 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006483 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006484 end = s + size;
6485
6486 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006487 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006488 Py_UCS4 ch;
6489 /* We copy the raw representation one byte at a time because the
6490 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006491 ((char *) &uch)[0] = s[0];
6492 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006493#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006494 ((char *) &uch)[2] = s[2];
6495 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006496#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006497 ch = uch;
6498
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 /* We have to sanity check the raw data, otherwise doom looms for
6500 some malformed UCS-4 data. */
6501 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006502#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006503 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006504#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006505 end-s < Py_UNICODE_SIZE
6506 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508 startinpos = s - starts;
6509 if (end-s < Py_UNICODE_SIZE) {
6510 endinpos = end-starts;
6511 reason = "truncated input";
6512 }
6513 else {
6514 endinpos = s - starts + Py_UNICODE_SIZE;
6515 reason = "illegal code point (> 0x10FFFF)";
6516 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517 if (unicode_decode_call_errorhandler(
6518 errors, &errorHandler,
6519 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006520 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006521 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006522 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006523 continue;
6524 }
6525
6526 s += Py_UNICODE_SIZE;
6527#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006528 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006529 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006530 Py_UNICODE uch2;
6531 ((char *) &uch2)[0] = s[0];
6532 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006533 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006534 {
Victor Stinner551ac952011-11-29 22:58:13 +01006535 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006536 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 }
6538 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006539#endif
6540
6541 if (unicode_putchar(&v, &outpos, ch) < 0)
6542 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006543 }
6544
Victor Stinner16e6a802011-12-12 13:24:15 +01006545 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006546 goto onError;
6547 Py_XDECREF(errorHandler);
6548 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006549 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006550
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 Py_XDECREF(v);
6553 Py_XDECREF(errorHandler);
6554 Py_XDECREF(exc);
6555 return NULL;
6556}
6557
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558/* --- Latin-1 Codec ------------------------------------------------------ */
6559
Alexander Belopolsky40018472011-02-26 01:02:56 +00006560PyObject *
6561PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006562 Py_ssize_t size,
6563 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006566 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567}
6568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static void
6571make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006572 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006573 PyObject *unicode,
6574 Py_ssize_t startpos, Py_ssize_t endpos,
6575 const char *reason)
6576{
6577 if (*exceptionObject == NULL) {
6578 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006579 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006580 encoding, unicode, startpos, endpos, reason);
6581 }
6582 else {
6583 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6584 goto onError;
6585 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6586 goto onError;
6587 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6588 goto onError;
6589 return;
6590 onError:
6591 Py_DECREF(*exceptionObject);
6592 *exceptionObject = NULL;
6593 }
6594}
6595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static void
6598raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006599 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006600 PyObject *unicode,
6601 Py_ssize_t startpos, Py_ssize_t endpos,
6602 const char *reason)
6603{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006604 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006605 encoding, unicode, startpos, endpos, reason);
6606 if (*exceptionObject != NULL)
6607 PyCodec_StrictErrors(*exceptionObject);
6608}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609
6610/* error handling callback helper:
6611 build arguments, call the callback and check the arguments,
6612 put the result into newpos and return the replacement string, which
6613 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006614static PyObject *
6615unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006616 PyObject **errorHandler,
6617 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006619 Py_ssize_t startpos, Py_ssize_t endpos,
6620 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006622 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624 PyObject *restuple;
6625 PyObject *resunicode;
6626
6627 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 }
6632
Benjamin Petersonbac79492012-01-14 13:34:47 -05006633 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006634 return NULL;
6635 len = PyUnicode_GET_LENGTH(unicode);
6636
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006637 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641
6642 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006647 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 Py_DECREF(restuple);
6649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006651 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 &resunicode, newpos)) {
6653 Py_DECREF(restuple);
6654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006656 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6657 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6658 Py_DECREF(restuple);
6659 return NULL;
6660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 *newpos = len + *newpos;
6663 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6665 Py_DECREF(restuple);
6666 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 Py_INCREF(resunicode);
6669 Py_DECREF(restuple);
6670 return resunicode;
6671}
6672
Alexander Belopolsky40018472011-02-26 01:02:56 +00006673static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006675 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006676 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 /* input state */
6679 Py_ssize_t pos=0, size;
6680 int kind;
6681 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 /* output object */
6683 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 /* pointer into the output */
6685 char *str;
6686 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006687 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006688 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6689 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 PyObject *errorHandler = NULL;
6691 PyObject *exc = NULL;
6692 /* the following variable is used for caching string comparisons
6693 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6694 int known_errorHandler = -1;
6695
Benjamin Petersonbac79492012-01-14 13:34:47 -05006696 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006697 return NULL;
6698 size = PyUnicode_GET_LENGTH(unicode);
6699 kind = PyUnicode_KIND(unicode);
6700 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 /* allocate enough for a simple encoding without
6702 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006703 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006704 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006705 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006707 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006708 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 ressize = size;
6710
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006711 while (pos < size) {
6712 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 /* can we encode this? */
6715 if (c<limit) {
6716 /* no overflow check, because we know that the space is enough */
6717 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006719 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 Py_ssize_t requiredsize;
6722 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006725 Py_ssize_t collstart = pos;
6726 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 ++collend;
6730 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6731 if (known_errorHandler==-1) {
6732 if ((errors==NULL) || (!strcmp(errors, "strict")))
6733 known_errorHandler = 1;
6734 else if (!strcmp(errors, "replace"))
6735 known_errorHandler = 2;
6736 else if (!strcmp(errors, "ignore"))
6737 known_errorHandler = 3;
6738 else if (!strcmp(errors, "xmlcharrefreplace"))
6739 known_errorHandler = 4;
6740 else
6741 known_errorHandler = 0;
6742 }
6743 switch (known_errorHandler) {
6744 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006745 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 goto onError;
6747 case 2: /* replace */
6748 while (collstart++<collend)
6749 *str++ = '?'; /* fall through */
6750 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 break;
6753 case 4: /* xmlcharrefreplace */
6754 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 /* determine replacement size */
6756 for (i = collstart, repsize = 0; i < collend; ++i) {
6757 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6758 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006762 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006770 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006771 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006773 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006775 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 if (requiredsize > ressize) {
6777 if (requiredsize<2*ressize)
6778 requiredsize = 2*ressize;
6779 if (_PyBytes_Resize(&res, requiredsize))
6780 goto onError;
6781 str = PyBytes_AS_STRING(res) + respos;
6782 ressize = requiredsize;
6783 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 /* generate replacement */
6785 for (i = collstart; i < collend; ++i) {
6786 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
6790 default:
6791 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 encoding, reason, unicode, &exc,
6793 collstart, collend, &newpos);
6794 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006795 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006797 if (PyBytes_Check(repunicode)) {
6798 /* Directly copy bytes result to output. */
6799 repsize = PyBytes_Size(repunicode);
6800 if (repsize > 1) {
6801 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006802 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006803 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6804 Py_DECREF(repunicode);
6805 goto onError;
6806 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006807 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006808 ressize += repsize-1;
6809 }
6810 memcpy(str, PyBytes_AsString(repunicode), repsize);
6811 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006812 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006813 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006814 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006815 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 /* need more space? (at least enough for what we
6817 have+the replacement+the rest of the string, so
6818 we won't have to check space for encodable characters) */
6819 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006820 repsize = PyUnicode_GET_LENGTH(repunicode);
6821 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 if (requiredsize > ressize) {
6823 if (requiredsize<2*ressize)
6824 requiredsize = 2*ressize;
6825 if (_PyBytes_Resize(&res, requiredsize)) {
6826 Py_DECREF(repunicode);
6827 goto onError;
6828 }
6829 str = PyBytes_AS_STRING(res) + respos;
6830 ressize = requiredsize;
6831 }
6832 /* check if there is anything unencodable in the replacement
6833 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834 for (i = 0; repsize-->0; ++i, ++str) {
6835 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006837 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006838 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 Py_DECREF(repunicode);
6840 goto onError;
6841 }
6842 *str = (char)c;
6843 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006845 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006846 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006847 }
6848 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006849 /* Resize if we allocated to much */
6850 size = str - PyBytes_AS_STRING(res);
6851 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006852 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006853 if (_PyBytes_Resize(&res, size) < 0)
6854 goto onError;
6855 }
6856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 Py_XDECREF(errorHandler);
6858 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006859 return res;
6860
6861 onError:
6862 Py_XDECREF(res);
6863 Py_XDECREF(errorHandler);
6864 Py_XDECREF(exc);
6865 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866}
6867
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006869PyObject *
6870PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006871 Py_ssize_t size,
6872 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874 PyObject *result;
6875 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6876 if (unicode == NULL)
6877 return NULL;
6878 result = unicode_encode_ucs1(unicode, errors, 256);
6879 Py_DECREF(unicode);
6880 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881}
6882
Alexander Belopolsky40018472011-02-26 01:02:56 +00006883PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006884_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
6886 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 PyErr_BadArgument();
6888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006890 if (PyUnicode_READY(unicode) == -1)
6891 return NULL;
6892 /* Fast path: if it is a one-byte string, construct
6893 bytes object directly. */
6894 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6895 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6896 PyUnicode_GET_LENGTH(unicode));
6897 /* Non-Latin-1 characters present. Defer to above function to
6898 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006899 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006900}
6901
6902PyObject*
6903PyUnicode_AsLatin1String(PyObject *unicode)
6904{
6905 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906}
6907
6908/* --- 7-bit ASCII Codec -------------------------------------------------- */
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910PyObject *
6911PyUnicode_DecodeASCII(const char *s,
6912 Py_ssize_t size,
6913 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006916 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006917 int kind;
6918 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 Py_ssize_t startinpos;
6920 Py_ssize_t endinpos;
6921 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006923 int has_error;
6924 const unsigned char *p = (const unsigned char *)s;
6925 const unsigned char *end = p + size;
6926 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 PyObject *errorHandler = NULL;
6928 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006929
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006930 if (size == 0) {
6931 Py_INCREF(unicode_empty);
6932 return unicode_empty;
6933 }
6934
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006936 if (size == 1 && (unsigned char)s[0] < 128)
6937 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006938
Victor Stinner702c7342011-10-05 13:50:52 +02006939 has_error = 0;
6940 while (p < end && !has_error) {
6941 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6942 an explanation. */
6943 if (!((size_t) p & LONG_PTR_MASK)) {
6944 /* Help register allocation */
6945 register const unsigned char *_p = p;
6946 while (_p < aligned_end) {
6947 unsigned long value = *(unsigned long *) _p;
6948 if (value & ASCII_CHAR_MASK) {
6949 has_error = 1;
6950 break;
6951 }
6952 _p += SIZEOF_LONG;
6953 }
6954 if (_p == end)
6955 break;
6956 if (has_error)
6957 break;
6958 p = _p;
6959 }
6960 if (*p & 0x80) {
6961 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006962 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006963 }
6964 else {
6965 ++p;
6966 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006967 }
Victor Stinner702c7342011-10-05 13:50:52 +02006968 if (!has_error)
6969 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006970
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006971 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006975 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006976 kind = PyUnicode_KIND(v);
6977 data = PyUnicode_DATA(v);
6978 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006979 e = s + size;
6980 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 register unsigned char c = (unsigned char)*s;
6982 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006983 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 ++s;
6985 }
6986 else {
6987 startinpos = s-starts;
6988 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 if (unicode_decode_call_errorhandler(
6990 errors, &errorHandler,
6991 "ascii", "ordinal not in range(128)",
6992 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006993 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006995 kind = PyUnicode_KIND(v);
6996 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006999 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007000 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001 Py_XDECREF(errorHandler);
7002 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007003 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007004 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007005
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007008 Py_XDECREF(errorHandler);
7009 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 return NULL;
7011}
7012
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007013/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007014PyObject *
7015PyUnicode_EncodeASCII(const Py_UNICODE *p,
7016 Py_ssize_t size,
7017 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007019 PyObject *result;
7020 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7021 if (unicode == NULL)
7022 return NULL;
7023 result = unicode_encode_ucs1(unicode, errors, 128);
7024 Py_DECREF(unicode);
7025 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026}
7027
Alexander Belopolsky40018472011-02-26 01:02:56 +00007028PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007029_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
7031 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 PyErr_BadArgument();
7033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007035 if (PyUnicode_READY(unicode) == -1)
7036 return NULL;
7037 /* Fast path: if it is an ASCII-only string, construct bytes object
7038 directly. Else defer to above function to raise the exception. */
7039 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7040 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7041 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007042 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007043}
7044
7045PyObject *
7046PyUnicode_AsASCIIString(PyObject *unicode)
7047{
7048 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049}
7050
Victor Stinner99b95382011-07-04 14:23:54 +02007051#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007052
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007053/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007054
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007055#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056#define NEED_RETRY
7057#endif
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059#ifndef WC_ERR_INVALID_CHARS
7060# define WC_ERR_INVALID_CHARS 0x0080
7061#endif
7062
7063static char*
7064code_page_name(UINT code_page, PyObject **obj)
7065{
7066 *obj = NULL;
7067 if (code_page == CP_ACP)
7068 return "mbcs";
7069 if (code_page == CP_UTF7)
7070 return "CP_UTF7";
7071 if (code_page == CP_UTF8)
7072 return "CP_UTF8";
7073
7074 *obj = PyBytes_FromFormat("cp%u", code_page);
7075 if (*obj == NULL)
7076 return NULL;
7077 return PyBytes_AS_STRING(*obj);
7078}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079
Alexander Belopolsky40018472011-02-26 01:02:56 +00007080static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007081is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082{
7083 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 if (!IsDBCSLeadByteEx(code_page, *curr))
7087 return 0;
7088
7089 prev = CharPrevExA(code_page, s, curr, 0);
7090 if (prev == curr)
7091 return 1;
7092 /* FIXME: This code is limited to "true" double-byte encodings,
7093 as it assumes an incomplete character consists of a single
7094 byte. */
7095 if (curr - prev == 2)
7096 return 1;
7097 if (!IsDBCSLeadByteEx(code_page, *prev))
7098 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099 return 0;
7100}
7101
Victor Stinner3a50e702011-10-18 21:21:00 +02007102static DWORD
7103decode_code_page_flags(UINT code_page)
7104{
7105 if (code_page == CP_UTF7) {
7106 /* The CP_UTF7 decoder only supports flags=0 */
7107 return 0;
7108 }
7109 else
7110 return MB_ERR_INVALID_CHARS;
7111}
7112
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 * Decode a byte string from a Windows code page into unicode object in strict
7115 * mode.
7116 *
7117 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7118 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007120static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007121decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007122 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007123 const char *in,
7124 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125{
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007127 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129
7130 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 assert(insize > 0);
7132 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7133 if (outsize <= 0)
7134 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007135
7136 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007138 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007139 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 if (*v == NULL)
7141 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 }
7144 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007147 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150 }
7151
7152 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7154 if (outsize <= 0)
7155 goto error;
7156 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007157
Victor Stinner3a50e702011-10-18 21:21:00 +02007158error:
7159 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7160 return -2;
7161 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007162 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007163}
7164
Victor Stinner3a50e702011-10-18 21:21:00 +02007165/*
7166 * Decode a byte string from a code page into unicode object with an error
7167 * handler.
7168 *
7169 * Returns consumed size if succeed, or raise a WindowsError or
7170 * UnicodeDecodeError exception and returns -1 on error.
7171 */
7172static int
7173decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007174 PyObject **v,
7175 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 const char *errors)
7177{
7178 const char *startin = in;
7179 const char *endin = in + size;
7180 const DWORD flags = decode_code_page_flags(code_page);
7181 /* Ideally, we should get reason from FormatMessage. This is the Windows
7182 2000 English version of the message. */
7183 const char *reason = "No mapping for the Unicode character exists "
7184 "in the target code page.";
7185 /* each step cannot decode more than 1 character, but a character can be
7186 represented as a surrogate pair */
7187 wchar_t buffer[2], *startout, *out;
7188 int insize, outsize;
7189 PyObject *errorHandler = NULL;
7190 PyObject *exc = NULL;
7191 PyObject *encoding_obj = NULL;
7192 char *encoding;
7193 DWORD err;
7194 int ret = -1;
7195
7196 assert(size > 0);
7197
7198 encoding = code_page_name(code_page, &encoding_obj);
7199 if (encoding == NULL)
7200 return -1;
7201
7202 if (errors == NULL || strcmp(errors, "strict") == 0) {
7203 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7204 UnicodeDecodeError. */
7205 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7206 if (exc != NULL) {
7207 PyCodec_StrictErrors(exc);
7208 Py_CLEAR(exc);
7209 }
7210 goto error;
7211 }
7212
7213 if (*v == NULL) {
7214 /* Create unicode object */
7215 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7216 PyErr_NoMemory();
7217 goto error;
7218 }
Victor Stinnerab595942011-12-17 04:59:06 +01007219 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 if (*v == NULL)
7222 goto error;
7223 startout = PyUnicode_AS_UNICODE(*v);
7224 }
7225 else {
7226 /* Extend unicode object */
7227 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7228 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7229 PyErr_NoMemory();
7230 goto error;
7231 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007232 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 goto error;
7234 startout = PyUnicode_AS_UNICODE(*v) + n;
7235 }
7236
7237 /* Decode the byte string character per character */
7238 out = startout;
7239 while (in < endin)
7240 {
7241 /* Decode a character */
7242 insize = 1;
7243 do
7244 {
7245 outsize = MultiByteToWideChar(code_page, flags,
7246 in, insize,
7247 buffer, Py_ARRAY_LENGTH(buffer));
7248 if (outsize > 0)
7249 break;
7250 err = GetLastError();
7251 if (err != ERROR_NO_UNICODE_TRANSLATION
7252 && err != ERROR_INSUFFICIENT_BUFFER)
7253 {
7254 PyErr_SetFromWindowsErr(0);
7255 goto error;
7256 }
7257 insize++;
7258 }
7259 /* 4=maximum length of a UTF-8 sequence */
7260 while (insize <= 4 && (in + insize) <= endin);
7261
7262 if (outsize <= 0) {
7263 Py_ssize_t startinpos, endinpos, outpos;
7264
7265 startinpos = in - startin;
7266 endinpos = startinpos + 1;
7267 outpos = out - PyUnicode_AS_UNICODE(*v);
7268 if (unicode_decode_call_errorhandler(
7269 errors, &errorHandler,
7270 encoding, reason,
7271 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007272 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 {
7274 goto error;
7275 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007276 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 }
7278 else {
7279 in += insize;
7280 memcpy(out, buffer, outsize * sizeof(wchar_t));
7281 out += outsize;
7282 }
7283 }
7284
7285 /* write a NUL character at the end */
7286 *out = 0;
7287
7288 /* Extend unicode object */
7289 outsize = out - startout;
7290 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007291 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007293 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007294
7295error:
7296 Py_XDECREF(encoding_obj);
7297 Py_XDECREF(errorHandler);
7298 Py_XDECREF(exc);
7299 return ret;
7300}
7301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302static PyObject *
7303decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007304 const char *s, Py_ssize_t size,
7305 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306{
Victor Stinner76a31a62011-11-04 00:05:13 +01007307 PyObject *v = NULL;
7308 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 if (code_page < 0) {
7311 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7312 return NULL;
7313 }
7314
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317
Victor Stinner76a31a62011-11-04 00:05:13 +01007318 do
7319 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 if (size > INT_MAX) {
7322 chunk_size = INT_MAX;
7323 final = 0;
7324 done = 0;
7325 }
7326 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 {
7329 chunk_size = (int)size;
7330 final = (consumed == NULL);
7331 done = 1;
7332 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 /* Skip trailing lead-byte unless 'final' is set */
7335 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7336 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 if (chunk_size == 0 && done) {
7339 if (v != NULL)
7340 break;
7341 Py_INCREF(unicode_empty);
7342 return unicode_empty;
7343 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344
Victor Stinner76a31a62011-11-04 00:05:13 +01007345
7346 converted = decode_code_page_strict(code_page, &v,
7347 s, chunk_size);
7348 if (converted == -2)
7349 converted = decode_code_page_errors(code_page, &v,
7350 s, chunk_size,
7351 errors);
7352 assert(converted != 0);
7353
7354 if (converted < 0) {
7355 Py_XDECREF(v);
7356 return NULL;
7357 }
7358
7359 if (consumed)
7360 *consumed += converted;
7361
7362 s += converted;
7363 size -= converted;
7364 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007365
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007366 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007367}
7368
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007370PyUnicode_DecodeCodePageStateful(int code_page,
7371 const char *s,
7372 Py_ssize_t size,
7373 const char *errors,
7374 Py_ssize_t *consumed)
7375{
7376 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7377}
7378
7379PyObject *
7380PyUnicode_DecodeMBCSStateful(const char *s,
7381 Py_ssize_t size,
7382 const char *errors,
7383 Py_ssize_t *consumed)
7384{
7385 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7386}
7387
7388PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007389PyUnicode_DecodeMBCS(const char *s,
7390 Py_ssize_t size,
7391 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007392{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007393 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7394}
7395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396static DWORD
7397encode_code_page_flags(UINT code_page, const char *errors)
7398{
7399 if (code_page == CP_UTF8) {
7400 if (winver.dwMajorVersion >= 6)
7401 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7402 and later */
7403 return WC_ERR_INVALID_CHARS;
7404 else
7405 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7406 return 0;
7407 }
7408 else if (code_page == CP_UTF7) {
7409 /* CP_UTF7 only supports flags=0 */
7410 return 0;
7411 }
7412 else {
7413 if (errors != NULL && strcmp(errors, "replace") == 0)
7414 return 0;
7415 else
7416 return WC_NO_BEST_FIT_CHARS;
7417 }
7418}
7419
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007420/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 * Encode a Unicode string to a Windows code page into a byte string in strict
7422 * mode.
7423 *
7424 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7425 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007426 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007427static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007428encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431{
Victor Stinner554f3f02010-06-16 23:33:54 +00007432 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 BOOL *pusedDefaultChar = &usedDefaultChar;
7434 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007435 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007436 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 const DWORD flags = encode_code_page_flags(code_page, NULL);
7439 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 /* Create a substring so that we can get the UTF-16 representation
7441 of just the slice under consideration. */
7442 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007443
Martin v. Löwis3d325192011-11-04 18:23:06 +01007444 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007445
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007447 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007449 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007450
Victor Stinner2fc507f2011-11-04 20:06:39 +01007451 substring = PyUnicode_Substring(unicode, offset, offset+len);
7452 if (substring == NULL)
7453 return -1;
7454 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7455 if (p == NULL) {
7456 Py_DECREF(substring);
7457 return -1;
7458 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007459
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007460 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 outsize = WideCharToMultiByte(code_page, flags,
7462 p, size,
7463 NULL, 0,
7464 NULL, pusedDefaultChar);
7465 if (outsize <= 0)
7466 goto error;
7467 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 if (pusedDefaultChar && *pusedDefaultChar) {
7469 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007472
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 if (*outbytes == NULL) {
7477 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481 }
7482 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 const Py_ssize_t n = PyBytes_Size(*outbytes);
7485 if (outsize > PY_SSIZE_T_MAX - n) {
7486 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007487 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7491 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007495 }
7496
7497 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 outsize = WideCharToMultiByte(code_page, flags,
7499 p, size,
7500 out, outsize,
7501 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (outsize <= 0)
7504 goto error;
7505 if (pusedDefaultChar && *pusedDefaultChar)
7506 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007507 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007508
Victor Stinner3a50e702011-10-18 21:21:00 +02007509error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007510 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7512 return -2;
7513 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007514 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007515}
7516
Victor Stinner3a50e702011-10-18 21:21:00 +02007517/*
7518 * Encode a Unicode string to a Windows code page into a byte string using a
7519 * error handler.
7520 *
7521 * Returns consumed characters if succeed, or raise a WindowsError and returns
7522 * -1 on other error.
7523 */
7524static int
7525encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007526 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007527 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007528{
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007530 Py_ssize_t pos = unicode_offset;
7531 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 /* Ideally, we should get reason from FormatMessage. This is the Windows
7533 2000 English version of the message. */
7534 const char *reason = "invalid character";
7535 /* 4=maximum length of a UTF-8 sequence */
7536 char buffer[4];
7537 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7538 Py_ssize_t outsize;
7539 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 PyObject *errorHandler = NULL;
7541 PyObject *exc = NULL;
7542 PyObject *encoding_obj = NULL;
7543 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007544 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 PyObject *rep;
7546 int ret = -1;
7547
7548 assert(insize > 0);
7549
7550 encoding = code_page_name(code_page, &encoding_obj);
7551 if (encoding == NULL)
7552 return -1;
7553
7554 if (errors == NULL || strcmp(errors, "strict") == 0) {
7555 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7556 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007557 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 if (exc != NULL) {
7559 PyCodec_StrictErrors(exc);
7560 Py_DECREF(exc);
7561 }
7562 Py_XDECREF(encoding_obj);
7563 return -1;
7564 }
7565
7566 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7567 pusedDefaultChar = &usedDefaultChar;
7568 else
7569 pusedDefaultChar = NULL;
7570
7571 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7572 PyErr_NoMemory();
7573 goto error;
7574 }
7575 outsize = insize * Py_ARRAY_LENGTH(buffer);
7576
7577 if (*outbytes == NULL) {
7578 /* Create string object */
7579 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7580 if (*outbytes == NULL)
7581 goto error;
7582 out = PyBytes_AS_STRING(*outbytes);
7583 }
7584 else {
7585 /* Extend string object */
7586 Py_ssize_t n = PyBytes_Size(*outbytes);
7587 if (n > PY_SSIZE_T_MAX - outsize) {
7588 PyErr_NoMemory();
7589 goto error;
7590 }
7591 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7592 goto error;
7593 out = PyBytes_AS_STRING(*outbytes) + n;
7594 }
7595
7596 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007597 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007599 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7600 wchar_t chars[2];
7601 int charsize;
7602 if (ch < 0x10000) {
7603 chars[0] = (wchar_t)ch;
7604 charsize = 1;
7605 }
7606 else {
7607 ch -= 0x10000;
7608 chars[0] = 0xd800 + (ch >> 10);
7609 chars[1] = 0xdc00 + (ch & 0x3ff);
7610 charsize = 2;
7611 }
7612
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007614 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 buffer, Py_ARRAY_LENGTH(buffer),
7616 NULL, pusedDefaultChar);
7617 if (outsize > 0) {
7618 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7619 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 memcpy(out, buffer, outsize);
7622 out += outsize;
7623 continue;
7624 }
7625 }
7626 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7627 PyErr_SetFromWindowsErr(0);
7628 goto error;
7629 }
7630
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 rep = unicode_encode_call_errorhandler(
7632 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007633 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007634 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007635 if (rep == NULL)
7636 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007637 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007638
7639 if (PyBytes_Check(rep)) {
7640 outsize = PyBytes_GET_SIZE(rep);
7641 if (outsize != 1) {
7642 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7643 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7644 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7645 Py_DECREF(rep);
7646 goto error;
7647 }
7648 out = PyBytes_AS_STRING(*outbytes) + offset;
7649 }
7650 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7651 out += outsize;
7652 }
7653 else {
7654 Py_ssize_t i;
7655 enum PyUnicode_Kind kind;
7656 void *data;
7657
Benjamin Petersonbac79492012-01-14 13:34:47 -05007658 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007659 Py_DECREF(rep);
7660 goto error;
7661 }
7662
7663 outsize = PyUnicode_GET_LENGTH(rep);
7664 if (outsize != 1) {
7665 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7666 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7667 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7668 Py_DECREF(rep);
7669 goto error;
7670 }
7671 out = PyBytes_AS_STRING(*outbytes) + offset;
7672 }
7673 kind = PyUnicode_KIND(rep);
7674 data = PyUnicode_DATA(rep);
7675 for (i=0; i < outsize; i++) {
7676 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7677 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007678 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007679 encoding, unicode,
7680 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007681 "unable to encode error handler result to ASCII");
7682 Py_DECREF(rep);
7683 goto error;
7684 }
7685 *out = (unsigned char)ch;
7686 out++;
7687 }
7688 }
7689 Py_DECREF(rep);
7690 }
7691 /* write a NUL byte */
7692 *out = 0;
7693 outsize = out - PyBytes_AS_STRING(*outbytes);
7694 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7695 if (_PyBytes_Resize(outbytes, outsize) < 0)
7696 goto error;
7697 ret = 0;
7698
7699error:
7700 Py_XDECREF(encoding_obj);
7701 Py_XDECREF(errorHandler);
7702 Py_XDECREF(exc);
7703 return ret;
7704}
7705
Victor Stinner3a50e702011-10-18 21:21:00 +02007706static PyObject *
7707encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007708 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 const char *errors)
7710{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007711 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007715
Benjamin Petersonbac79492012-01-14 13:34:47 -05007716 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717 return NULL;
7718 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007719
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 if (code_page < 0) {
7721 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7722 return NULL;
7723 }
7724
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 return PyBytes_FromStringAndSize(NULL, 0);
7727
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 offset = 0;
7729 do
7730 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007731#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007732 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 chunks. */
7734 if (len > INT_MAX/2) {
7735 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 done = 0;
7737 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007739#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 done = 1;
7743 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007744
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007747 errors);
7748 if (ret == -2)
7749 ret = encode_code_page_errors(code_page, &outbytes,
7750 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 if (ret < 0) {
7753 Py_XDECREF(outbytes);
7754 return NULL;
7755 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007760
Victor Stinner3a50e702011-10-18 21:21:00 +02007761 return outbytes;
7762}
7763
7764PyObject *
7765PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7766 Py_ssize_t size,
7767 const char *errors)
7768{
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 PyObject *unicode, *res;
7770 unicode = PyUnicode_FromUnicode(p, size);
7771 if (unicode == NULL)
7772 return NULL;
7773 res = encode_code_page(CP_ACP, unicode, errors);
7774 Py_DECREF(unicode);
7775 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007776}
7777
7778PyObject *
7779PyUnicode_EncodeCodePage(int code_page,
7780 PyObject *unicode,
7781 const char *errors)
7782{
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007784}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007785
Alexander Belopolsky40018472011-02-26 01:02:56 +00007786PyObject *
7787PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007788{
7789 if (!PyUnicode_Check(unicode)) {
7790 PyErr_BadArgument();
7791 return NULL;
7792 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007793 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007794}
7795
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007796#undef NEED_RETRY
7797
Victor Stinner99b95382011-07-04 14:23:54 +02007798#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007799
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800/* --- Character Mapping Codec -------------------------------------------- */
7801
Alexander Belopolsky40018472011-02-26 01:02:56 +00007802PyObject *
7803PyUnicode_DecodeCharmap(const char *s,
7804 Py_ssize_t size,
7805 PyObject *mapping,
7806 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007809 Py_ssize_t startinpos;
7810 Py_ssize_t endinpos;
7811 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007813 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007814 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 PyObject *errorHandler = NULL;
7816 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007817
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 /* Default to Latin-1 */
7819 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007822 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007826 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007827 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007829 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007830 Py_ssize_t maplen;
7831 enum PyUnicode_Kind kind;
7832 void *data;
7833 Py_UCS4 x;
7834
Benjamin Petersonbac79492012-01-14 13:34:47 -05007835 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007836 return NULL;
7837
7838 maplen = PyUnicode_GET_LENGTH(mapping);
7839 data = PyUnicode_DATA(mapping);
7840 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 while (s < e) {
7842 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007845 x = PyUnicode_READ(kind, data, ch);
7846 else
7847 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007849 if (x == 0xfffe)
7850 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 startinpos = s-starts;
7853 endinpos = startinpos+1;
7854 if (unicode_decode_call_errorhandler(
7855 errors, &errorHandler,
7856 "charmap", "character maps to <undefined>",
7857 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007858 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 goto onError;
7860 }
7861 continue;
7862 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007863
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007864 if (unicode_putchar(&v, &outpos, x) < 0)
7865 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007867 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007868 }
7869 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 while (s < e) {
7871 unsigned char ch = *s;
7872 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007873
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7875 w = PyLong_FromLong((long)ch);
7876 if (w == NULL)
7877 goto onError;
7878 x = PyObject_GetItem(mapping, w);
7879 Py_DECREF(w);
7880 if (x == NULL) {
7881 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7882 /* No mapping found means: mapping is undefined. */
7883 PyErr_Clear();
7884 x = Py_None;
7885 Py_INCREF(x);
7886 } else
7887 goto onError;
7888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 /* Apply mapping */
7891 if (PyLong_Check(x)) {
7892 long value = PyLong_AS_LONG(x);
7893 if (value < 0 || value > 65535) {
7894 PyErr_SetString(PyExc_TypeError,
7895 "character mapping must be in range(65536)");
7896 Py_DECREF(x);
7897 goto onError;
7898 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007899 if (unicode_putchar(&v, &outpos, value) < 0)
7900 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 }
7902 else if (x == Py_None) {
7903 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 startinpos = s-starts;
7905 endinpos = startinpos+1;
7906 if (unicode_decode_call_errorhandler(
7907 errors, &errorHandler,
7908 "charmap", "character maps to <undefined>",
7909 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007910 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 Py_DECREF(x);
7912 goto onError;
7913 }
7914 Py_DECREF(x);
7915 continue;
7916 }
7917 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007918 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919
Benjamin Petersonbac79492012-01-14 13:34:47 -05007920 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007921 goto onError;
7922 targetsize = PyUnicode_GET_LENGTH(x);
7923
7924 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007926 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007927 PyUnicode_READ_CHAR(x, 0)) < 0)
7928 goto onError;
7929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 else if (targetsize > 1) {
7931 /* 1-n mapping */
7932 if (targetsize > extrachars) {
7933 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 Py_ssize_t needed = (targetsize - extrachars) + \
7935 (targetsize << 2);
7936 extrachars += needed;
7937 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007938 if (unicode_resize(&v,
7939 PyUnicode_GET_LENGTH(v) + needed) < 0)
7940 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 Py_DECREF(x);
7942 goto onError;
7943 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007945 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7946 goto onError;
7947 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7948 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 extrachars -= targetsize;
7950 }
7951 /* 1-0 mapping: skip the character */
7952 }
7953 else {
7954 /* wrong return value */
7955 PyErr_SetString(PyExc_TypeError,
7956 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007957 Py_DECREF(x);
7958 goto onError;
7959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 Py_DECREF(x);
7961 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007964 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 Py_XDECREF(errorHandler);
7967 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007968 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007969
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 Py_XDECREF(errorHandler);
7972 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 Py_XDECREF(v);
7974 return NULL;
7975}
7976
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977/* Charmap encoding: the lookup table */
7978
Alexander Belopolsky40018472011-02-26 01:02:56 +00007979struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 PyObject_HEAD
7981 unsigned char level1[32];
7982 int count2, count3;
7983 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984};
7985
7986static PyObject*
7987encoding_map_size(PyObject *obj, PyObject* args)
7988{
7989 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992}
7993
7994static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007995 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 PyDoc_STR("Return the size (in bytes) of this object") },
7997 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998};
7999
8000static void
8001encoding_map_dealloc(PyObject* o)
8002{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004}
8005
8006static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 "EncodingMap", /*tp_name*/
8009 sizeof(struct encoding_map), /*tp_basicsize*/
8010 0, /*tp_itemsize*/
8011 /* methods */
8012 encoding_map_dealloc, /*tp_dealloc*/
8013 0, /*tp_print*/
8014 0, /*tp_getattr*/
8015 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008016 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 0, /*tp_repr*/
8018 0, /*tp_as_number*/
8019 0, /*tp_as_sequence*/
8020 0, /*tp_as_mapping*/
8021 0, /*tp_hash*/
8022 0, /*tp_call*/
8023 0, /*tp_str*/
8024 0, /*tp_getattro*/
8025 0, /*tp_setattro*/
8026 0, /*tp_as_buffer*/
8027 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8028 0, /*tp_doc*/
8029 0, /*tp_traverse*/
8030 0, /*tp_clear*/
8031 0, /*tp_richcompare*/
8032 0, /*tp_weaklistoffset*/
8033 0, /*tp_iter*/
8034 0, /*tp_iternext*/
8035 encoding_map_methods, /*tp_methods*/
8036 0, /*tp_members*/
8037 0, /*tp_getset*/
8038 0, /*tp_base*/
8039 0, /*tp_dict*/
8040 0, /*tp_descr_get*/
8041 0, /*tp_descr_set*/
8042 0, /*tp_dictoffset*/
8043 0, /*tp_init*/
8044 0, /*tp_alloc*/
8045 0, /*tp_new*/
8046 0, /*tp_free*/
8047 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048};
8049
8050PyObject*
8051PyUnicode_BuildEncodingMap(PyObject* string)
8052{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053 PyObject *result;
8054 struct encoding_map *mresult;
8055 int i;
8056 int need_dict = 0;
8057 unsigned char level1[32];
8058 unsigned char level2[512];
8059 unsigned char *mlevel1, *mlevel2, *mlevel3;
8060 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 int kind;
8062 void *data;
8063 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 PyErr_BadArgument();
8067 return NULL;
8068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008069 kind = PyUnicode_KIND(string);
8070 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071 memset(level1, 0xFF, sizeof level1);
8072 memset(level2, 0xFF, sizeof level2);
8073
8074 /* If there isn't a one-to-one mapping of NULL to \0,
8075 or if there are non-BMP characters, we need to use
8076 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078 need_dict = 1;
8079 for (i = 1; i < 256; i++) {
8080 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 ch = PyUnicode_READ(kind, data, i);
8082 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 need_dict = 1;
8084 break;
8085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087 /* unmapped character */
8088 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 l1 = ch >> 11;
8090 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 if (level1[l1] == 0xFF)
8092 level1[l1] = count2++;
8093 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 }
8096
8097 if (count2 >= 0xFF || count3 >= 0xFF)
8098 need_dict = 1;
8099
8100 if (need_dict) {
8101 PyObject *result = PyDict_New();
8102 PyObject *key, *value;
8103 if (!result)
8104 return NULL;
8105 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008107 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 if (!key || !value)
8109 goto failed1;
8110 if (PyDict_SetItem(result, key, value) == -1)
8111 goto failed1;
8112 Py_DECREF(key);
8113 Py_DECREF(value);
8114 }
8115 return result;
8116 failed1:
8117 Py_XDECREF(key);
8118 Py_XDECREF(value);
8119 Py_DECREF(result);
8120 return NULL;
8121 }
8122
8123 /* Create a three-level trie */
8124 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8125 16*count2 + 128*count3 - 1);
8126 if (!result)
8127 return PyErr_NoMemory();
8128 PyObject_Init(result, &EncodingMapType);
8129 mresult = (struct encoding_map*)result;
8130 mresult->count2 = count2;
8131 mresult->count3 = count3;
8132 mlevel1 = mresult->level1;
8133 mlevel2 = mresult->level23;
8134 mlevel3 = mresult->level23 + 16*count2;
8135 memcpy(mlevel1, level1, 32);
8136 memset(mlevel2, 0xFF, 16*count2);
8137 memset(mlevel3, 0, 128*count3);
8138 count3 = 0;
8139 for (i = 1; i < 256; i++) {
8140 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 /* unmapped character */
8143 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 o1 = PyUnicode_READ(kind, data, i)>>11;
8145 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 i2 = 16*mlevel1[o1] + o2;
8147 if (mlevel2[i2] == 0xFF)
8148 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 i3 = 128*mlevel2[i2] + o3;
8151 mlevel3[i3] = i;
8152 }
8153 return result;
8154}
8155
8156static int
Victor Stinner22168992011-11-20 17:09:18 +01008157encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158{
8159 struct encoding_map *map = (struct encoding_map*)mapping;
8160 int l1 = c>>11;
8161 int l2 = (c>>7) & 0xF;
8162 int l3 = c & 0x7F;
8163 int i;
8164
Victor Stinner22168992011-11-20 17:09:18 +01008165 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 if (c == 0)
8168 return 0;
8169 /* level 1*/
8170 i = map->level1[l1];
8171 if (i == 0xFF) {
8172 return -1;
8173 }
8174 /* level 2*/
8175 i = map->level23[16*i+l2];
8176 if (i == 0xFF) {
8177 return -1;
8178 }
8179 /* level 3 */
8180 i = map->level23[16*map->count2 + 128*i + l3];
8181 if (i == 0) {
8182 return -1;
8183 }
8184 return i;
8185}
8186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008187/* Lookup the character ch in the mapping. If the character
8188 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008189 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008190static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008191charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192{
Christian Heimes217cfd12007-12-02 14:31:20 +00008193 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 PyObject *x;
8195
8196 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 x = PyObject_GetItem(mapping, w);
8199 Py_DECREF(w);
8200 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8202 /* No mapping found means: mapping is undefined. */
8203 PyErr_Clear();
8204 x = Py_None;
8205 Py_INCREF(x);
8206 return x;
8207 } else
8208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008210 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008212 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 long value = PyLong_AS_LONG(x);
8214 if (value < 0 || value > 255) {
8215 PyErr_SetString(PyExc_TypeError,
8216 "character mapping must be in range(256)");
8217 Py_DECREF(x);
8218 return NULL;
8219 }
8220 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008222 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 /* wrong return value */
8226 PyErr_Format(PyExc_TypeError,
8227 "character mapping must return integer, bytes or None, not %.400s",
8228 x->ob_type->tp_name);
8229 Py_DECREF(x);
8230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 }
8232}
8233
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008234static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008235charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8238 /* exponentially overallocate to minimize reallocations */
8239 if (requiredsize < 2*outsize)
8240 requiredsize = 2*outsize;
8241 if (_PyBytes_Resize(outobj, requiredsize))
8242 return -1;
8243 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008244}
8245
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008250 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 space is available. Return a new reference to the object that
8252 was put in the output buffer, or Py_None, if the mapping was undefined
8253 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008254 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008255static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008256charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008257 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259 PyObject *rep;
8260 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008261 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262
Christian Heimes90aa7642007-12-19 02:45:37 +00008263 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 if (res == -1)
8267 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 if (outsize<requiredsize)
8269 if (charmapencode_resize(outobj, outpos, requiredsize))
8270 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008271 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 outstart[(*outpos)++] = (char)res;
8273 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274 }
8275
8276 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 Py_DECREF(rep);
8281 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008282 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (PyLong_Check(rep)) {
8284 Py_ssize_t requiredsize = *outpos+1;
8285 if (outsize<requiredsize)
8286 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8287 Py_DECREF(rep);
8288 return enc_EXCEPTION;
8289 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 else {
8294 const char *repchars = PyBytes_AS_STRING(rep);
8295 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8296 Py_ssize_t requiredsize = *outpos+repsize;
8297 if (outsize<requiredsize)
8298 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8299 Py_DECREF(rep);
8300 return enc_EXCEPTION;
8301 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 memcpy(outstart + *outpos, repchars, repsize);
8304 *outpos += repsize;
8305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 Py_DECREF(rep);
8308 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309}
8310
8311/* handle an error in PyUnicode_EncodeCharmap
8312 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313static int
8314charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008315 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008317 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008318 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319{
8320 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008321 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008322 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008323 enum PyUnicode_Kind kind;
8324 void *data;
8325 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t collstartpos = *inpos;
8328 Py_ssize_t collendpos = *inpos+1;
8329 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 char *encoding = "charmap";
8331 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008333 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008334 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335
Benjamin Petersonbac79492012-01-14 13:34:47 -05008336 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008337 return -1;
8338 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 /* find all unencodable characters */
8340 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008342 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008343 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008344 val = encoding_map_lookup(ch, mapping);
8345 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 break;
8347 ++collendpos;
8348 continue;
8349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008350
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008351 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8352 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 if (rep==NULL)
8354 return -1;
8355 else if (rep!=Py_None) {
8356 Py_DECREF(rep);
8357 break;
8358 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008359 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 }
8362 /* cache callback name lookup
8363 * (if not done yet, i.e. it's the first error) */
8364 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 if ((errors==NULL) || (!strcmp(errors, "strict")))
8366 *known_errorHandler = 1;
8367 else if (!strcmp(errors, "replace"))
8368 *known_errorHandler = 2;
8369 else if (!strcmp(errors, "ignore"))
8370 *known_errorHandler = 3;
8371 else if (!strcmp(errors, "xmlcharrefreplace"))
8372 *known_errorHandler = 4;
8373 else
8374 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 }
8376 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008377 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008378 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 return -1;
8380 case 2: /* replace */
8381 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 x = charmapencode_output('?', mapping, res, respos);
8383 if (x==enc_EXCEPTION) {
8384 return -1;
8385 }
8386 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008387 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 return -1;
8389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
8391 /* fall through */
8392 case 3: /* ignore */
8393 *inpos = collendpos;
8394 break;
8395 case 4: /* xmlcharrefreplace */
8396 /* generate replacement (temporarily (mis)uses p) */
8397 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 char buffer[2+29+1+1];
8399 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008400 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 for (cp = buffer; *cp; ++cp) {
8402 x = charmapencode_output(*cp, mapping, res, respos);
8403 if (x==enc_EXCEPTION)
8404 return -1;
8405 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008406 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 return -1;
8408 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008409 }
8410 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 *inpos = collendpos;
8412 break;
8413 default:
8414 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008415 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008417 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008419 if (PyBytes_Check(repunicode)) {
8420 /* Directly copy bytes result to output. */
8421 Py_ssize_t outsize = PyBytes_Size(*res);
8422 Py_ssize_t requiredsize;
8423 repsize = PyBytes_Size(repunicode);
8424 requiredsize = *respos + repsize;
8425 if (requiredsize > outsize)
8426 /* Make room for all additional bytes. */
8427 if (charmapencode_resize(res, respos, requiredsize)) {
8428 Py_DECREF(repunicode);
8429 return -1;
8430 }
8431 memcpy(PyBytes_AsString(*res) + *respos,
8432 PyBytes_AsString(repunicode), repsize);
8433 *respos += repsize;
8434 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008435 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008436 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008437 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008439 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008440 Py_DECREF(repunicode);
8441 return -1;
8442 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008443 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008444 data = PyUnicode_DATA(repunicode);
8445 kind = PyUnicode_KIND(repunicode);
8446 for (index = 0; index < repsize; index++) {
8447 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8448 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008450 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return -1;
8452 }
8453 else if (x==enc_FAILED) {
8454 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008455 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
8457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 }
8459 *inpos = newpos;
8460 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 }
8462 return 0;
8463}
8464
Alexander Belopolsky40018472011-02-26 01:02:56 +00008465PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008466_PyUnicode_EncodeCharmap(PyObject *unicode,
8467 PyObject *mapping,
8468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 /* output object */
8471 PyObject *res = NULL;
8472 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008474 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008476 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477 PyObject *errorHandler = NULL;
8478 PyObject *exc = NULL;
8479 /* the following variable is used for caching string comparisons
8480 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8481 * 3=ignore, 4=xmlcharrefreplace */
8482 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483
Benjamin Petersonbac79492012-01-14 13:34:47 -05008484 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008485 return NULL;
8486 size = PyUnicode_GET_LENGTH(unicode);
8487
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 /* Default to Latin-1 */
8489 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008490 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 /* allocate enough for a simple encoding without
8493 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008494 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 if (res == NULL)
8496 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008497 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008501 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 if (x==enc_EXCEPTION) /* error */
8505 goto onError;
8506 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 &exc,
8509 &known_errorHandler, &errorHandler, errors,
8510 &res, &respos)) {
8511 goto onError;
8512 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 else
8515 /* done with this character => adjust input position */
8516 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008520 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008521 if (_PyBytes_Resize(&res, respos) < 0)
8522 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 Py_XDECREF(exc);
8525 Py_XDECREF(errorHandler);
8526 return res;
8527
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 Py_XDECREF(res);
8530 Py_XDECREF(exc);
8531 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 return NULL;
8533}
8534
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008535/* Deprecated */
8536PyObject *
8537PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8538 Py_ssize_t size,
8539 PyObject *mapping,
8540 const char *errors)
8541{
8542 PyObject *result;
8543 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8544 if (unicode == NULL)
8545 return NULL;
8546 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8547 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008548 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549}
8550
Alexander Belopolsky40018472011-02-26 01:02:56 +00008551PyObject *
8552PyUnicode_AsCharmapString(PyObject *unicode,
8553 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554{
8555 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 PyErr_BadArgument();
8557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008559 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560}
8561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563static void
8564make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008566 Py_ssize_t startpos, Py_ssize_t endpos,
8567 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 *exceptionObject = _PyUnicodeTranslateError_Create(
8571 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 }
8573 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8575 goto onError;
8576 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8577 goto onError;
8578 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8579 goto onError;
8580 return;
8581 onError:
8582 Py_DECREF(*exceptionObject);
8583 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 }
8585}
8586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008588static void
8589raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008591 Py_ssize_t startpos, Py_ssize_t endpos,
8592 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593{
8594 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598}
8599
8600/* error handling callback helper:
8601 build arguments, call the callback and check the arguments,
8602 put the result into newpos and return the replacement string, which
8603 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604static PyObject *
8605unicode_translate_call_errorhandler(const char *errors,
8606 PyObject **errorHandler,
8607 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609 Py_ssize_t startpos, Py_ssize_t endpos,
8610 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008612 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008614 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008615 PyObject *restuple;
8616 PyObject *resunicode;
8617
8618 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 }
8623
8624 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628
8629 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008634 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 Py_DECREF(restuple);
8636 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 }
8638 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 &resunicode, &i_newpos)) {
8640 Py_DECREF(restuple);
8641 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008643 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008645 else
8646 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8649 Py_DECREF(restuple);
8650 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008651 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 Py_INCREF(resunicode);
8653 Py_DECREF(restuple);
8654 return resunicode;
8655}
8656
8657/* Lookup the character ch in the mapping and put the result in result,
8658 which must be decrefed by the caller.
8659 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662{
Christian Heimes217cfd12007-12-02 14:31:20 +00008663 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 PyObject *x;
8665
8666 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 x = PyObject_GetItem(mapping, w);
8669 Py_DECREF(w);
8670 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8672 /* No mapping found means: use 1:1 mapping. */
8673 PyErr_Clear();
8674 *result = NULL;
8675 return 0;
8676 } else
8677 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678 }
8679 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 *result = x;
8681 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008683 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 long value = PyLong_AS_LONG(x);
8685 long max = PyUnicode_GetMax();
8686 if (value < 0 || value > max) {
8687 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008688 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_DECREF(x);
8690 return -1;
8691 }
8692 *result = x;
8693 return 0;
8694 }
8695 else if (PyUnicode_Check(x)) {
8696 *result = x;
8697 return 0;
8698 }
8699 else {
8700 /* wrong return value */
8701 PyErr_SetString(PyExc_TypeError,
8702 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008703 Py_DECREF(x);
8704 return -1;
8705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706}
8707/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 if not reallocate and adjust various state variables.
8709 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008710static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008715 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 /* exponentially overallocate to minimize reallocations */
8717 if (requiredsize < 2 * oldsize)
8718 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8720 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 }
8724 return 0;
8725}
8726/* lookup the character, put the result in the output string and adjust
8727 various state variables. Return a new reference to the object that
8728 was put in the output buffer in *result, or Py_None, if the mapping was
8729 undefined (in which case no character was written).
8730 The called must decref result.
8731 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008732static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8734 PyObject *mapping, Py_UCS4 **output,
8735 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008736 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8739 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 }
8745 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008747 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 }
8751 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 Py_ssize_t repsize;
8753 if (PyUnicode_READY(*res) == -1)
8754 return -1;
8755 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 if (repsize==1) {
8757 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 }
8760 else if (repsize!=0) {
8761 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 Py_ssize_t requiredsize = *opos +
8763 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 Py_ssize_t i;
8766 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 for(i = 0; i < repsize; i++)
8769 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008771 }
8772 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008774 return 0;
8775}
8776
Alexander Belopolsky40018472011-02-26 01:02:56 +00008777PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778_PyUnicode_TranslateCharmap(PyObject *input,
8779 PyObject *mapping,
8780 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 /* input object */
8783 char *idata;
8784 Py_ssize_t size, i;
8785 int kind;
8786 /* output buffer */
8787 Py_UCS4 *output = NULL;
8788 Py_ssize_t osize;
8789 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 char *reason = "character maps to <undefined>";
8793 PyObject *errorHandler = NULL;
8794 PyObject *exc = NULL;
8795 /* the following variable is used for caching string comparisons
8796 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8797 * 3=ignore, 4=xmlcharrefreplace */
8798 int known_errorHandler = -1;
8799
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 PyErr_BadArgument();
8802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 if (PyUnicode_READY(input) == -1)
8806 return NULL;
8807 idata = (char*)PyUnicode_DATA(input);
8808 kind = PyUnicode_KIND(input);
8809 size = PyUnicode_GET_LENGTH(input);
8810 i = 0;
8811
8812 if (size == 0) {
8813 Py_INCREF(input);
8814 return input;
8815 }
8816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817 /* allocate enough for a simple 1:1 translation without
8818 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 osize = size;
8820 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8821 opos = 0;
8822 if (output == NULL) {
8823 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 /* try to encode it */
8829 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 if (charmaptranslate_output(input, i, mapping,
8831 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 Py_XDECREF(x);
8833 goto onError;
8834 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 else { /* untranslatable character */
8839 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8840 Py_ssize_t repsize;
8841 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 Py_ssize_t collstart = i;
8845 Py_ssize_t collend = i+1;
8846 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 while (collend < size) {
8850 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 goto onError;
8852 Py_XDECREF(x);
8853 if (x!=Py_None)
8854 break;
8855 ++collend;
8856 }
8857 /* cache callback name lookup
8858 * (if not done yet, i.e. it's the first error) */
8859 if (known_errorHandler==-1) {
8860 if ((errors==NULL) || (!strcmp(errors, "strict")))
8861 known_errorHandler = 1;
8862 else if (!strcmp(errors, "replace"))
8863 known_errorHandler = 2;
8864 else if (!strcmp(errors, "ignore"))
8865 known_errorHandler = 3;
8866 else if (!strcmp(errors, "xmlcharrefreplace"))
8867 known_errorHandler = 4;
8868 else
8869 known_errorHandler = 0;
8870 }
8871 switch (known_errorHandler) {
8872 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 raise_translate_exception(&exc, input, collstart,
8874 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008875 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 case 2: /* replace */
8877 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 for (coll = collstart; coll<collend; coll++)
8879 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 /* fall through */
8881 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 break;
8884 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 /* generate replacement (temporarily (mis)uses i) */
8886 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 char buffer[2+29+1+1];
8888 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8890 if (charmaptranslate_makespace(&output, &osize,
8891 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 goto onError;
8893 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 break;
8898 default:
8899 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 reason, input, &exc,
8901 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008902 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008904 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008905 Py_DECREF(repunicode);
8906 goto onError;
8907 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 repsize = PyUnicode_GET_LENGTH(repunicode);
8910 if (charmaptranslate_makespace(&output, &osize,
8911 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 Py_DECREF(repunicode);
8913 goto onError;
8914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 for (uni2 = 0; repsize-->0; ++uni2)
8916 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8917 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008920 }
8921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8923 if (!res)
8924 goto onError;
8925 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008926 Py_XDECREF(exc);
8927 Py_XDECREF(errorHandler);
8928 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008932 Py_XDECREF(exc);
8933 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 return NULL;
8935}
8936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937/* Deprecated. Use PyUnicode_Translate instead. */
8938PyObject *
8939PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8940 Py_ssize_t size,
8941 PyObject *mapping,
8942 const char *errors)
8943{
8944 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8945 if (!unicode)
8946 return NULL;
8947 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8948}
8949
Alexander Belopolsky40018472011-02-26 01:02:56 +00008950PyObject *
8951PyUnicode_Translate(PyObject *str,
8952 PyObject *mapping,
8953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954{
8955 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008956
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 str = PyUnicode_FromObject(str);
8958 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 Py_DECREF(str);
8962 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008963
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 Py_XDECREF(str);
8966 return NULL;
8967}
Tim Petersced69f82003-09-16 20:30:58 +00008968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008970fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971{
8972 /* No need to call PyUnicode_READY(self) because this function is only
8973 called as a callback from fixup() which does it already. */
8974 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8975 const int kind = PyUnicode_KIND(self);
8976 void *data = PyUnicode_DATA(self);
8977 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008978 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 Py_ssize_t i;
8980
8981 for (i = 0; i < len; ++i) {
8982 ch = PyUnicode_READ(kind, data, i);
8983 fixed = 0;
8984 if (ch > 127) {
8985 if (Py_UNICODE_ISSPACE(ch))
8986 fixed = ' ';
8987 else {
8988 const int decimal = Py_UNICODE_TODECIMAL(ch);
8989 if (decimal >= 0)
8990 fixed = '0' + decimal;
8991 }
8992 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008993 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 if (fixed > maxchar)
8995 maxchar = fixed;
8996 PyUnicode_WRITE(kind, data, i, fixed);
8997 }
8998 else if (ch > maxchar)
8999 maxchar = ch;
9000 }
9001 else if (ch > maxchar)
9002 maxchar = ch;
9003 }
9004
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009005 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006}
9007
9008PyObject *
9009_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9010{
9011 if (!PyUnicode_Check(unicode)) {
9012 PyErr_BadInternalCall();
9013 return NULL;
9014 }
9015 if (PyUnicode_READY(unicode) == -1)
9016 return NULL;
9017 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9018 /* If the string is already ASCII, just return the same string */
9019 Py_INCREF(unicode);
9020 return unicode;
9021 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009022 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023}
9024
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009025PyObject *
9026PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9027 Py_ssize_t length)
9028{
Victor Stinnerf0124502011-11-21 23:12:56 +01009029 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009030 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009031 Py_UCS4 maxchar;
9032 enum PyUnicode_Kind kind;
9033 void *data;
9034
Victor Stinner99d7ad02012-02-22 13:37:39 +01009035 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009036 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009037 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009038 if (ch > 127) {
9039 int decimal = Py_UNICODE_TODECIMAL(ch);
9040 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009041 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009042 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009043 }
9044 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009045
9046 /* Copy to a new string */
9047 decimal = PyUnicode_New(length, maxchar);
9048 if (decimal == NULL)
9049 return decimal;
9050 kind = PyUnicode_KIND(decimal);
9051 data = PyUnicode_DATA(decimal);
9052 /* Iterate over code points */
9053 for (i = 0; i < length; i++) {
9054 Py_UNICODE ch = s[i];
9055 if (ch > 127) {
9056 int decimal = Py_UNICODE_TODECIMAL(ch);
9057 if (decimal >= 0)
9058 ch = '0' + decimal;
9059 }
9060 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009062 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009063}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009064/* --- Decimal Encoder ---------------------------------------------------- */
9065
Alexander Belopolsky40018472011-02-26 01:02:56 +00009066int
9067PyUnicode_EncodeDecimal(Py_UNICODE *s,
9068 Py_ssize_t length,
9069 char *output,
9070 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009071{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009072 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009073 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009074 enum PyUnicode_Kind kind;
9075 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009076
9077 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 PyErr_BadArgument();
9079 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009080 }
9081
Victor Stinner42bf7752011-11-21 22:52:58 +01009082 unicode = PyUnicode_FromUnicode(s, length);
9083 if (unicode == NULL)
9084 return -1;
9085
Benjamin Petersonbac79492012-01-14 13:34:47 -05009086 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009087 Py_DECREF(unicode);
9088 return -1;
9089 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009090 kind = PyUnicode_KIND(unicode);
9091 data = PyUnicode_DATA(unicode);
9092
Victor Stinnerb84d7232011-11-22 01:50:07 +01009093 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009094 PyObject *exc;
9095 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009097 Py_ssize_t startpos;
9098
9099 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009100
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009102 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009103 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009105 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 decimal = Py_UNICODE_TODECIMAL(ch);
9107 if (decimal >= 0) {
9108 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009109 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 continue;
9111 }
9112 if (0 < ch && ch < 256) {
9113 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009114 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 continue;
9116 }
Victor Stinner6345be92011-11-25 20:09:01 +01009117
Victor Stinner42bf7752011-11-21 22:52:58 +01009118 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009119 exc = NULL;
9120 raise_encode_exception(&exc, "decimal", unicode,
9121 startpos, startpos+1,
9122 "invalid decimal Unicode string");
9123 Py_XDECREF(exc);
9124 Py_DECREF(unicode);
9125 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009126 }
9127 /* 0-terminate the output string */
9128 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009129 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009130 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009131}
9132
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133/* --- Helpers ------------------------------------------------------------ */
9134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009135static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009136any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 Py_ssize_t start,
9138 Py_ssize_t end)
9139{
9140 int kind1, kind2, kind;
9141 void *buf1, *buf2;
9142 Py_ssize_t len1, len2, result;
9143
9144 kind1 = PyUnicode_KIND(s1);
9145 kind2 = PyUnicode_KIND(s2);
9146 kind = kind1 > kind2 ? kind1 : kind2;
9147 buf1 = PyUnicode_DATA(s1);
9148 buf2 = PyUnicode_DATA(s2);
9149 if (kind1 != kind)
9150 buf1 = _PyUnicode_AsKind(s1, kind);
9151 if (!buf1)
9152 return -2;
9153 if (kind2 != kind)
9154 buf2 = _PyUnicode_AsKind(s2, kind);
9155 if (!buf2) {
9156 if (kind1 != kind) PyMem_Free(buf1);
9157 return -2;
9158 }
9159 len1 = PyUnicode_GET_LENGTH(s1);
9160 len2 = PyUnicode_GET_LENGTH(s2);
9161
Victor Stinner794d5672011-10-10 03:21:36 +02009162 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009163 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009164 case PyUnicode_1BYTE_KIND:
9165 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9166 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9167 else
9168 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9169 break;
9170 case PyUnicode_2BYTE_KIND:
9171 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9172 break;
9173 case PyUnicode_4BYTE_KIND:
9174 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9175 break;
9176 default:
9177 assert(0); result = -2;
9178 }
9179 }
9180 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009181 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009182 case PyUnicode_1BYTE_KIND:
9183 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9184 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9185 else
9186 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9187 break;
9188 case PyUnicode_2BYTE_KIND:
9189 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9190 break;
9191 case PyUnicode_4BYTE_KIND:
9192 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9193 break;
9194 default:
9195 assert(0); result = -2;
9196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 }
9198
9199 if (kind1 != kind)
9200 PyMem_Free(buf1);
9201 if (kind2 != kind)
9202 PyMem_Free(buf2);
9203
9204 return result;
9205}
9206
9207Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009208_PyUnicode_InsertThousandsGrouping(
9209 PyObject *unicode, Py_ssize_t index,
9210 Py_ssize_t n_buffer,
9211 void *digits, Py_ssize_t n_digits,
9212 Py_ssize_t min_width,
9213 const char *grouping, PyObject *thousands_sep,
9214 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215{
Victor Stinner41a863c2012-02-24 00:37:51 +01009216 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009217 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009218 Py_ssize_t thousands_sep_len;
9219 Py_ssize_t len;
9220
9221 if (unicode != NULL) {
9222 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009223 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009224 }
9225 else {
9226 kind = PyUnicode_1BYTE_KIND;
9227 data = NULL;
9228 }
9229 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9230 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9231 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9232 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009233 if (thousands_sep_kind < kind) {
9234 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9235 if (!thousands_sep_data)
9236 return -1;
9237 }
9238 else {
9239 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9240 if (!data)
9241 return -1;
9242 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009243 }
9244
Benjamin Petersonead6b532011-12-20 17:23:42 -06009245 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009247 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009248 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009249 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009250 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009251 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009252 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009253 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009254 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009256 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009257 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009259 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009260 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009261 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009262 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009263 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009266 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009268 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009269 break;
9270 default:
9271 assert(0);
9272 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009274 if (unicode != NULL && thousands_sep_kind != kind) {
9275 if (thousands_sep_kind < kind)
9276 PyMem_Free(thousands_sep_data);
9277 else
9278 PyMem_Free(data);
9279 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009280 if (unicode == NULL) {
9281 *maxchar = 127;
9282 if (len != n_digits) {
9283 *maxchar = Py_MAX(*maxchar,
9284 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9285 }
9286 }
9287 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288}
9289
9290
/* Helper macro to fix up start/end slice values in place.

   end:   values above len are clipped to len; negative values have len
          added and are then floored at 0.
   start: negative values have len added and are then floored at 0
          (start is not clipped against len).

   start and end must be modifiable lvalues; every argument is evaluated more
   than once.  The body is wrapped in do { } while (0) so the macro behaves as
   a single statement (safe in an unbraced if/else) and must be followed by a
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009305
Alexander Belopolsky40018472011-02-26 01:02:56 +00009306Py_ssize_t
9307PyUnicode_Count(PyObject *str,
9308 PyObject *substr,
9309 Py_ssize_t start,
9310 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009312 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009313 PyObject* str_obj;
9314 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 int kind1, kind2, kind;
9316 void *buf1 = NULL, *buf2 = NULL;
9317 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009318
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009319 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009320 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009322 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009323 if (!sub_obj) {
9324 Py_DECREF(str_obj);
9325 return -1;
9326 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009327 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009328 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 Py_DECREF(str_obj);
9330 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331 }
Tim Petersced69f82003-09-16 20:30:58 +00009332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 kind1 = PyUnicode_KIND(str_obj);
9334 kind2 = PyUnicode_KIND(sub_obj);
9335 kind = kind1 > kind2 ? kind1 : kind2;
9336 buf1 = PyUnicode_DATA(str_obj);
9337 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009338 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 if (!buf1)
9340 goto onError;
9341 buf2 = PyUnicode_DATA(sub_obj);
9342 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009343 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 if (!buf2)
9345 goto onError;
9346 len1 = PyUnicode_GET_LENGTH(str_obj);
9347 len2 = PyUnicode_GET_LENGTH(sub_obj);
9348
9349 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009350 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009352 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9353 result = asciilib_count(
9354 ((Py_UCS1*)buf1) + start, end - start,
9355 buf2, len2, PY_SSIZE_T_MAX
9356 );
9357 else
9358 result = ucs1lib_count(
9359 ((Py_UCS1*)buf1) + start, end - start,
9360 buf2, len2, PY_SSIZE_T_MAX
9361 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 break;
9363 case PyUnicode_2BYTE_KIND:
9364 result = ucs2lib_count(
9365 ((Py_UCS2*)buf1) + start, end - start,
9366 buf2, len2, PY_SSIZE_T_MAX
9367 );
9368 break;
9369 case PyUnicode_4BYTE_KIND:
9370 result = ucs4lib_count(
9371 ((Py_UCS4*)buf1) + start, end - start,
9372 buf2, len2, PY_SSIZE_T_MAX
9373 );
9374 break;
9375 default:
9376 assert(0); result = 0;
9377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009378
9379 Py_DECREF(sub_obj);
9380 Py_DECREF(str_obj);
9381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 if (kind1 != kind)
9383 PyMem_Free(buf1);
9384 if (kind2 != kind)
9385 PyMem_Free(buf2);
9386
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 onError:
9389 Py_DECREF(sub_obj);
9390 Py_DECREF(str_obj);
9391 if (kind1 != kind && buf1)
9392 PyMem_Free(buf1);
9393 if (kind2 != kind && buf2)
9394 PyMem_Free(buf2);
9395 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396}
9397
Alexander Belopolsky40018472011-02-26 01:02:56 +00009398Py_ssize_t
9399PyUnicode_Find(PyObject *str,
9400 PyObject *sub,
9401 Py_ssize_t start,
9402 Py_ssize_t end,
9403 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009405 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009406
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009408 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009410 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009411 if (!sub) {
9412 Py_DECREF(str);
9413 return -2;
9414 }
9415 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9416 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 Py_DECREF(str);
9418 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 }
Tim Petersced69f82003-09-16 20:30:58 +00009420
Victor Stinner794d5672011-10-10 03:21:36 +02009421 result = any_find_slice(direction,
9422 str, sub, start, end
9423 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009424
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009426 Py_DECREF(sub);
9427
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428 return result;
9429}
9430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431Py_ssize_t
9432PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9433 Py_ssize_t start, Py_ssize_t end,
9434 int direction)
9435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009437 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 if (PyUnicode_READY(str) == -1)
9439 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009440 if (start < 0 || end < 0) {
9441 PyErr_SetString(PyExc_IndexError, "string index out of range");
9442 return -2;
9443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 if (end > PyUnicode_GET_LENGTH(str))
9445 end = PyUnicode_GET_LENGTH(str);
9446 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009447 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9448 kind, end-start, ch, direction);
9449 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009451 else
9452 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453}
9454
Alexander Belopolsky40018472011-02-26 01:02:56 +00009455static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009456tailmatch(PyObject *self,
9457 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009458 Py_ssize_t start,
9459 Py_ssize_t end,
9460 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 int kind_self;
9463 int kind_sub;
9464 void *data_self;
9465 void *data_sub;
9466 Py_ssize_t offset;
9467 Py_ssize_t i;
9468 Py_ssize_t end_sub;
9469
9470 if (PyUnicode_READY(self) == -1 ||
9471 PyUnicode_READY(substring) == -1)
9472 return 0;
9473
9474 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 return 1;
9476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9478 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 kind_self = PyUnicode_KIND(self);
9483 data_self = PyUnicode_DATA(self);
9484 kind_sub = PyUnicode_KIND(substring);
9485 data_sub = PyUnicode_DATA(substring);
9486 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9487
9488 if (direction > 0)
9489 offset = end;
9490 else
9491 offset = start;
9492
9493 if (PyUnicode_READ(kind_self, data_self, offset) ==
9494 PyUnicode_READ(kind_sub, data_sub, 0) &&
9495 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9496 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9497 /* If both are of the same kind, memcmp is sufficient */
9498 if (kind_self == kind_sub) {
9499 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009500 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 data_sub,
9502 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009503 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 }
9505 /* otherwise we have to compare each character by first accesing it */
9506 else {
9507 /* We do not need to compare 0 and len(substring)-1 because
9508 the if statement above ensured already that they are equal
9509 when we end up here. */
9510 // TODO: honor direction and do a forward or backwards search
9511 for (i = 1; i < end_sub; ++i) {
9512 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9513 PyUnicode_READ(kind_sub, data_sub, i))
9514 return 0;
9515 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
9519
9520 return 0;
9521}
9522
Alexander Belopolsky40018472011-02-26 01:02:56 +00009523Py_ssize_t
9524PyUnicode_Tailmatch(PyObject *str,
9525 PyObject *substr,
9526 Py_ssize_t start,
9527 Py_ssize_t end,
9528 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009530 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009531
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 str = PyUnicode_FromObject(str);
9533 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 substr = PyUnicode_FromObject(substr);
9536 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 Py_DECREF(str);
9538 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
Tim Petersced69f82003-09-16 20:30:58 +00009540
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009541 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 Py_DECREF(str);
9544 Py_DECREF(substr);
9545 return result;
9546}
9547
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548/* Apply fixfct filter to the Unicode object self and return a
9549 reference to the modified object */
9550
Alexander Belopolsky40018472011-02-26 01:02:56 +00009551static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009552fixup(PyObject *self,
9553 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 PyObject *u;
9556 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009557 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009559 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009562 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 /* fix functions return the new maximum character in a string,
9565 if the kind of the resulting unicode object does not change,
9566 everything is fine. Otherwise we need to change the string kind
9567 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009568 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009569
9570 if (maxchar_new == 0) {
9571 /* no changes */;
9572 if (PyUnicode_CheckExact(self)) {
9573 Py_DECREF(u);
9574 Py_INCREF(self);
9575 return self;
9576 }
9577 else
9578 return u;
9579 }
9580
9581 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 maxchar_new = 127;
9583 else if (maxchar_new <= 255)
9584 maxchar_new = 255;
9585 else if (maxchar_new <= 65535)
9586 maxchar_new = 65535;
9587 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009588 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589
Victor Stinnereaab6042011-12-11 22:22:39 +01009590 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009592
9593 /* In case the maximum character changed, we need to
9594 convert the string to the new category. */
9595 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9596 if (v == NULL) {
9597 Py_DECREF(u);
9598 return NULL;
9599 }
9600 if (maxchar_new > maxchar_old) {
9601 /* If the maxchar increased so that the kind changed, not all
9602 characters are representable anymore and we need to fix the
9603 string again. This only happens in very few cases. */
9604 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9605 maxchar_old = fixfct(v);
9606 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 }
9608 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009609 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009611 Py_DECREF(u);
9612 assert(_PyUnicode_CheckConsistency(v, 1));
9613 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614}
9615
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009616static PyObject *
9617ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009619 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9620 char *resdata, *data = PyUnicode_DATA(self);
9621 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009622
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009623 res = PyUnicode_New(len, 127);
9624 if (res == NULL)
9625 return NULL;
9626 resdata = PyUnicode_DATA(res);
9627 if (lower)
9628 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630 _Py_bytes_upper(resdata, data, len);
9631 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632}
9633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637 Py_ssize_t j;
9638 int final_sigma;
9639 Py_UCS4 c;
9640 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009641
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9643
9644 where ! is a negation and \p{xxx} is a character with property xxx.
9645 */
9646 for (j = i - 1; j >= 0; j--) {
9647 c = PyUnicode_READ(kind, data, j);
9648 if (!_PyUnicode_IsCaseIgnorable(c))
9649 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9652 if (final_sigma) {
9653 for (j = i + 1; j < length; j++) {
9654 c = PyUnicode_READ(kind, data, j);
9655 if (!_PyUnicode_IsCaseIgnorable(c))
9656 break;
9657 }
9658 final_sigma = j == length || !_PyUnicode_IsCased(c);
9659 }
9660 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661}
9662
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663static int
9664lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9665 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 /* Obscure special case. */
9668 if (c == 0x3A3) {
9669 mapped[0] = handle_capital_sigma(kind, data, length, i);
9670 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673}
9674
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675static Py_ssize_t
9676do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 Py_ssize_t i, k = 0;
9679 int n_res, j;
9680 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009681
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682 c = PyUnicode_READ(kind, data, 0);
9683 n_res = _PyUnicode_ToUpperFull(c, mapped);
9684 for (j = 0; j < n_res; j++) {
9685 if (mapped[j] > *maxchar)
9686 *maxchar = mapped[j];
9687 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 for (i = 1; i < length; i++) {
9690 c = PyUnicode_READ(kind, data, i);
9691 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9692 for (j = 0; j < n_res; j++) {
9693 if (mapped[j] > *maxchar)
9694 *maxchar = mapped[j];
9695 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009696 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009697 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701static Py_ssize_t
9702do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9703 Py_ssize_t i, k = 0;
9704
9705 for (i = 0; i < length; i++) {
9706 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9707 int n_res, j;
9708 if (Py_UNICODE_ISUPPER(c)) {
9709 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9710 }
9711 else if (Py_UNICODE_ISLOWER(c)) {
9712 n_res = _PyUnicode_ToUpperFull(c, mapped);
9713 }
9714 else {
9715 n_res = 1;
9716 mapped[0] = c;
9717 }
9718 for (j = 0; j < n_res; j++) {
9719 if (mapped[j] > *maxchar)
9720 *maxchar = mapped[j];
9721 res[k++] = mapped[j];
9722 }
9723 }
9724 return k;
9725}
9726
9727static Py_ssize_t
9728do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9729 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 Py_ssize_t i, k = 0;
9732
9733 for (i = 0; i < length; i++) {
9734 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9735 int n_res, j;
9736 if (lower)
9737 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9738 else
9739 n_res = _PyUnicode_ToUpperFull(c, mapped);
9740 for (j = 0; j < n_res; j++) {
9741 if (mapped[j] > *maxchar)
9742 *maxchar = mapped[j];
9743 res[k++] = mapped[j];
9744 }
9745 }
9746 return k;
9747}
9748
9749static Py_ssize_t
9750do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9751{
9752 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9753}
9754
9755static Py_ssize_t
9756do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9757{
9758 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9759}
9760
Benjamin Petersone51757f2012-01-12 21:10:29 -05009761static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009762do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9763{
9764 Py_ssize_t i, k = 0;
9765
9766 for (i = 0; i < length; i++) {
9767 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9768 Py_UCS4 mapped[3];
9769 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9770 for (j = 0; j < n_res; j++) {
9771 if (mapped[j] > *maxchar)
9772 *maxchar = mapped[j];
9773 res[k++] = mapped[j];
9774 }
9775 }
9776 return k;
9777}
9778
9779static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009780do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9781{
9782 Py_ssize_t i, k = 0;
9783 int previous_is_cased;
9784
9785 previous_is_cased = 0;
9786 for (i = 0; i < length; i++) {
9787 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9788 Py_UCS4 mapped[3];
9789 int n_res, j;
9790
9791 if (previous_is_cased)
9792 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9793 else
9794 n_res = _PyUnicode_ToTitleFull(c, mapped);
9795
9796 for (j = 0; j < n_res; j++) {
9797 if (mapped[j] > *maxchar)
9798 *maxchar = mapped[j];
9799 res[k++] = mapped[j];
9800 }
9801
9802 previous_is_cased = _PyUnicode_IsCased(c);
9803 }
9804 return k;
9805}
9806
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009807static PyObject *
9808case_operation(PyObject *self,
9809 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9810{
9811 PyObject *res = NULL;
9812 Py_ssize_t length, newlength = 0;
9813 int kind, outkind;
9814 void *data, *outdata;
9815 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9816
Benjamin Petersoneea48462012-01-16 14:28:50 -05009817 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818
9819 kind = PyUnicode_KIND(self);
9820 data = PyUnicode_DATA(self);
9821 length = PyUnicode_GET_LENGTH(self);
9822 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9823 if (tmp == NULL)
9824 return PyErr_NoMemory();
9825 newlength = perform(kind, data, length, tmp, &maxchar);
9826 res = PyUnicode_New(newlength, maxchar);
9827 if (res == NULL)
9828 goto leave;
9829 tmpend = tmp + newlength;
9830 outdata = PyUnicode_DATA(res);
9831 outkind = PyUnicode_KIND(res);
9832 switch (outkind) {
9833 case PyUnicode_1BYTE_KIND:
9834 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9835 break;
9836 case PyUnicode_2BYTE_KIND:
9837 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9838 break;
9839 case PyUnicode_4BYTE_KIND:
9840 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9841 break;
9842 default:
9843 assert(0);
9844 break;
9845 }
9846 leave:
9847 PyMem_FREE(tmp);
9848 return res;
9849}
9850
Tim Peters8ce9f162004-08-27 01:49:32 +00009851PyObject *
9852PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009855 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009857 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009858 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9859 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009860 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009862 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009864 int use_memcpy;
9865 unsigned char *res_data = NULL, *sep_data = NULL;
9866 PyObject *last_obj;
9867 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
Tim Peters05eba1f2004-08-27 21:32:02 +00009869 fseq = PySequence_Fast(seq, "");
9870 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009871 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009872 }
9873
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009874 /* NOTE: the following code can't call back into Python code,
9875 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009876 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009877
Tim Peters05eba1f2004-08-27 21:32:02 +00009878 seqlen = PySequence_Fast_GET_SIZE(fseq);
9879 /* If empty sequence, return u"". */
9880 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009881 Py_DECREF(fseq);
9882 Py_INCREF(unicode_empty);
9883 res = unicode_empty;
9884 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009885 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009886
Tim Peters05eba1f2004-08-27 21:32:02 +00009887 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009888 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009889 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009890 if (seqlen == 1) {
9891 if (PyUnicode_CheckExact(items[0])) {
9892 res = items[0];
9893 Py_INCREF(res);
9894 Py_DECREF(fseq);
9895 return res;
9896 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009897 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009898 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009899 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009900 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009901 /* Set up sep and seplen */
9902 if (separator == NULL) {
9903 /* fall back to a blank space separator */
9904 sep = PyUnicode_FromOrdinal(' ');
9905 if (!sep)
9906 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009907 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009908 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009910 else {
9911 if (!PyUnicode_Check(separator)) {
9912 PyErr_Format(PyExc_TypeError,
9913 "separator: expected str instance,"
9914 " %.80s found",
9915 Py_TYPE(separator)->tp_name);
9916 goto onError;
9917 }
9918 if (PyUnicode_READY(separator))
9919 goto onError;
9920 sep = separator;
9921 seplen = PyUnicode_GET_LENGTH(separator);
9922 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9923 /* inc refcount to keep this code path symmetric with the
9924 above case of a blank separator */
9925 Py_INCREF(sep);
9926 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009927 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 }
9929
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009930 /* There are at least two things to join, or else we have a subclass
9931 * of str in the sequence.
9932 * Do a pre-pass to figure out the total amount of space we'll
9933 * need (sz), and see whether all argument are strings.
9934 */
9935 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009936#ifdef Py_DEBUG
9937 use_memcpy = 0;
9938#else
9939 use_memcpy = 1;
9940#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 for (i = 0; i < seqlen; i++) {
9942 const Py_ssize_t old_sz = sz;
9943 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 if (!PyUnicode_Check(item)) {
9945 PyErr_Format(PyExc_TypeError,
9946 "sequence item %zd: expected str instance,"
9947 " %.80s found",
9948 i, Py_TYPE(item)->tp_name);
9949 goto onError;
9950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 if (PyUnicode_READY(item) == -1)
9952 goto onError;
9953 sz += PyUnicode_GET_LENGTH(item);
9954 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009955 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 if (i != 0)
9957 sz += seplen;
9958 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9959 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009960 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009961 goto onError;
9962 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009963 if (use_memcpy && last_obj != NULL) {
9964 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9965 use_memcpy = 0;
9966 }
9967 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 }
Tim Petersced69f82003-09-16 20:30:58 +00009969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009971 if (res == NULL)
9972 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009973
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009974 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009975#ifdef Py_DEBUG
9976 use_memcpy = 0;
9977#else
9978 if (use_memcpy) {
9979 res_data = PyUnicode_1BYTE_DATA(res);
9980 kind = PyUnicode_KIND(res);
9981 if (seplen != 0)
9982 sep_data = PyUnicode_1BYTE_DATA(sep);
9983 }
9984#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009986 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009989 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009990 if (use_memcpy) {
9991 Py_MEMCPY(res_data,
9992 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009993 kind * seplen);
9994 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009995 }
9996 else {
9997 copy_characters(res, res_offset, sep, 0, seplen);
9998 res_offset += seplen;
9999 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010001 itemlen = PyUnicode_GET_LENGTH(item);
10002 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010003 if (use_memcpy) {
10004 Py_MEMCPY(res_data,
10005 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010006 kind * itemlen);
10007 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 }
10009 else {
10010 copy_characters(res, res_offset, item, 0, itemlen);
10011 res_offset += itemlen;
10012 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010013 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010014 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010015 if (use_memcpy)
10016 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010017 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010018 else
10019 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010020
Tim Peters05eba1f2004-08-27 21:32:02 +000010021 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010023 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010027 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010029 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 return NULL;
10031}
10032
/* Store `value` into `length` consecutive characters of the raw character
   buffer `data`, starting at character index `start`.  `kind` selects the
   element width (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); any other kind trips an assertion.

   `kind`, `value` and `length` are each evaluated exactly once, so it is
   safe to pass expressions with side effects.  `data` and `start` are
   evaluated once, in the branch that is taken. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (kind); \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10056
Victor Stinner3fe55312012-01-04 00:33:50 +010010057Py_ssize_t
10058PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10059 Py_UCS4 fill_char)
10060{
10061 Py_ssize_t maxlen;
10062 enum PyUnicode_Kind kind;
10063 void *data;
10064
10065 if (!PyUnicode_Check(unicode)) {
10066 PyErr_BadInternalCall();
10067 return -1;
10068 }
10069 if (PyUnicode_READY(unicode) == -1)
10070 return -1;
10071 if (unicode_check_modifiable(unicode))
10072 return -1;
10073
10074 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10075 PyErr_SetString(PyExc_ValueError,
10076 "fill character is bigger than "
10077 "the string maximum character");
10078 return -1;
10079 }
10080
10081 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10082 length = Py_MIN(maxlen, length);
10083 if (length <= 0)
10084 return 0;
10085
10086 kind = PyUnicode_KIND(unicode);
10087 data = PyUnicode_DATA(unicode);
10088 FILL(kind, data, fill_char, start, length);
10089 return length;
10090}
10091
Victor Stinner9310abb2011-10-05 00:59:23 +020010092static PyObject *
10093pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010094 Py_ssize_t left,
10095 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 PyObject *u;
10099 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010100 int kind;
10101 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
10103 if (left < 0)
10104 left = 0;
10105 if (right < 0)
10106 right = 0;
10107
Victor Stinnerc4b49542011-12-11 22:44:26 +010010108 if (left == 0 && right == 0)
10109 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10112 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010113 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10114 return NULL;
10115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10117 if (fill > maxchar)
10118 maxchar = fill;
10119 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010120 if (!u)
10121 return NULL;
10122
10123 kind = PyUnicode_KIND(u);
10124 data = PyUnicode_DATA(u);
10125 if (left)
10126 FILL(kind, data, fill, 0, left);
10127 if (right)
10128 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010129 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010130 assert(_PyUnicode_CheckConsistency(u, 1));
10131 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132}
10133
Alexander Belopolsky40018472011-02-26 01:02:56 +000010134PyObject *
10135PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
10139 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010140 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010142 if (PyUnicode_READY(string) == -1) {
10143 Py_DECREF(string);
10144 return NULL;
10145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
Benjamin Petersonead6b532011-12-20 17:23:42 -060010147 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010149 if (PyUnicode_IS_ASCII(string))
10150 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010152 PyUnicode_GET_LENGTH(string), keepends);
10153 else
10154 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010156 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 break;
10158 case PyUnicode_2BYTE_KIND:
10159 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010160 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 PyUnicode_GET_LENGTH(string), keepends);
10162 break;
10163 case PyUnicode_4BYTE_KIND:
10164 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 PyUnicode_GET_LENGTH(string), keepends);
10167 break;
10168 default:
10169 assert(0);
10170 list = 0;
10171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 Py_DECREF(string);
10173 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174}
10175
Alexander Belopolsky40018472011-02-26 01:02:56 +000010176static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010177split(PyObject *self,
10178 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010179 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 int kind1, kind2, kind;
10182 void *buf1, *buf2;
10183 Py_ssize_t len1, len2;
10184 PyObject* out;
10185
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010187 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (PyUnicode_READY(self) == -1)
10190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010193 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 if (PyUnicode_IS_ASCII(self))
10196 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 PyUnicode_GET_LENGTH(self), maxcount
10199 );
10200 else
10201 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010203 PyUnicode_GET_LENGTH(self), maxcount
10204 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 case PyUnicode_2BYTE_KIND:
10206 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 PyUnicode_GET_LENGTH(self), maxcount
10209 );
10210 case PyUnicode_4BYTE_KIND:
10211 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 PyUnicode_GET_LENGTH(self), maxcount
10214 );
10215 default:
10216 assert(0);
10217 return NULL;
10218 }
10219
10220 if (PyUnicode_READY(substring) == -1)
10221 return NULL;
10222
10223 kind1 = PyUnicode_KIND(self);
10224 kind2 = PyUnicode_KIND(substring);
10225 kind = kind1 > kind2 ? kind1 : kind2;
10226 buf1 = PyUnicode_DATA(self);
10227 buf2 = PyUnicode_DATA(substring);
10228 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (!buf1)
10231 return NULL;
10232 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 if (!buf2) {
10235 if (kind1 != kind) PyMem_Free(buf1);
10236 return NULL;
10237 }
10238 len1 = PyUnicode_GET_LENGTH(self);
10239 len2 = PyUnicode_GET_LENGTH(substring);
10240
Benjamin Petersonead6b532011-12-20 17:23:42 -060010241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010243 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10244 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 else
10247 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 break;
10250 case PyUnicode_2BYTE_KIND:
10251 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 break;
10254 case PyUnicode_4BYTE_KIND:
10255 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010256 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 break;
10258 default:
10259 out = NULL;
10260 }
10261 if (kind1 != kind)
10262 PyMem_Free(buf1);
10263 if (kind2 != kind)
10264 PyMem_Free(buf2);
10265 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266}
10267
Alexander Belopolsky40018472011-02-26 01:02:56 +000010268static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010269rsplit(PyObject *self,
10270 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010271 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 int kind1, kind2, kind;
10274 void *buf1, *buf2;
10275 Py_ssize_t len1, len2;
10276 PyObject* out;
10277
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010278 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010279 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (PyUnicode_READY(self) == -1)
10282 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010285 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 if (PyUnicode_IS_ASCII(self))
10288 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010290 PyUnicode_GET_LENGTH(self), maxcount
10291 );
10292 else
10293 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 PyUnicode_GET_LENGTH(self), maxcount
10296 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 case PyUnicode_2BYTE_KIND:
10298 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 PyUnicode_GET_LENGTH(self), maxcount
10301 );
10302 case PyUnicode_4BYTE_KIND:
10303 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 PyUnicode_GET_LENGTH(self), maxcount
10306 );
10307 default:
10308 assert(0);
10309 return NULL;
10310 }
10311
10312 if (PyUnicode_READY(substring) == -1)
10313 return NULL;
10314
10315 kind1 = PyUnicode_KIND(self);
10316 kind2 = PyUnicode_KIND(substring);
10317 kind = kind1 > kind2 ? kind1 : kind2;
10318 buf1 = PyUnicode_DATA(self);
10319 buf2 = PyUnicode_DATA(substring);
10320 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010321 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (!buf1)
10323 return NULL;
10324 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010325 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (!buf2) {
10327 if (kind1 != kind) PyMem_Free(buf1);
10328 return NULL;
10329 }
10330 len1 = PyUnicode_GET_LENGTH(self);
10331 len2 = PyUnicode_GET_LENGTH(substring);
10332
Benjamin Petersonead6b532011-12-20 17:23:42 -060010333 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010335 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10336 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010338 else
10339 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 break;
10342 case PyUnicode_2BYTE_KIND:
10343 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 break;
10346 case PyUnicode_4BYTE_KIND:
10347 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 break;
10350 default:
10351 out = NULL;
10352 }
10353 if (kind1 != kind)
10354 PyMem_Free(buf1);
10355 if (kind2 != kind)
10356 PyMem_Free(buf2);
10357 return out;
10358}
10359
10360static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10362 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010364 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010366 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10367 return asciilib_find(buf1, len1, buf2, len2, offset);
10368 else
10369 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 case PyUnicode_2BYTE_KIND:
10371 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10372 case PyUnicode_4BYTE_KIND:
10373 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10374 }
10375 assert(0);
10376 return -1;
10377}
10378
10379static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10381 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010383 switch (kind) {
10384 case PyUnicode_1BYTE_KIND:
10385 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10386 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10387 else
10388 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10389 case PyUnicode_2BYTE_KIND:
10390 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10391 case PyUnicode_4BYTE_KIND:
10392 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10393 }
10394 assert(0);
10395 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010396}
10397
Alexander Belopolsky40018472011-02-26 01:02:56 +000010398static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399replace(PyObject *self, PyObject *str1,
10400 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 PyObject *u;
10403 char *sbuf = PyUnicode_DATA(self);
10404 char *buf1 = PyUnicode_DATA(str1);
10405 char *buf2 = PyUnicode_DATA(str2);
10406 int srelease = 0, release1 = 0, release2 = 0;
10407 int skind = PyUnicode_KIND(self);
10408 int kind1 = PyUnicode_KIND(str1);
10409 int kind2 = PyUnicode_KIND(str2);
10410 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10411 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10412 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010413 int mayshrink;
10414 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415
10416 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010417 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010419 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420
Victor Stinner59de0ee2011-10-07 10:01:28 +020010421 if (str1 == str2)
10422 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 if (skind < kind1)
10424 /* substring too wide to be present */
10425 goto nothing;
10426
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10428 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10429 /* Replacing str1 with str2 may cause a maxchar reduction in the
10430 result string. */
10431 mayshrink = (maxchar_str2 < maxchar);
10432 maxchar = Py_MAX(maxchar, maxchar_str2);
10433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010435 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010437 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010440 Py_UCS4 u1, u2;
10441 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010442 Py_ssize_t index, pos;
10443 char *src;
10444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010446 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10447 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010453 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010455
10456 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10457 index = 0;
10458 src = sbuf;
10459 while (--maxcount)
10460 {
10461 pos++;
10462 src += pos * PyUnicode_KIND(self);
10463 slen -= pos;
10464 index += pos;
10465 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10466 if (pos < 0)
10467 break;
10468 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10469 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010470 }
10471 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 int rkind = skind;
10473 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010474 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (kind1 < rkind) {
10477 /* widen substring */
10478 buf1 = _PyUnicode_AsKind(str1, rkind);
10479 if (!buf1) goto error;
10480 release1 = 1;
10481 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010482 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483 if (i < 0)
10484 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 if (rkind > kind2) {
10486 /* widen replacement */
10487 buf2 = _PyUnicode_AsKind(str2, rkind);
10488 if (!buf2) goto error;
10489 release2 = 1;
10490 }
10491 else if (rkind < kind2) {
10492 /* widen self and buf1 */
10493 rkind = kind2;
10494 if (release1) PyMem_Free(buf1);
10495 sbuf = _PyUnicode_AsKind(self, rkind);
10496 if (!sbuf) goto error;
10497 srelease = 1;
10498 buf1 = _PyUnicode_AsKind(str1, rkind);
10499 if (!buf1) goto error;
10500 release1 = 1;
10501 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 u = PyUnicode_New(slen, maxchar);
10503 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 assert(PyUnicode_KIND(u) == rkind);
10506 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010507
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010508 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010509 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010510 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010512 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514
10515 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010517 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010518 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010519 if (i == -1)
10520 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010527 }
10528 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 Py_ssize_t n, i, j, ires;
10530 Py_ssize_t product, new_size;
10531 int rkind = skind;
10532 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010535 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 buf1 = _PyUnicode_AsKind(str1, rkind);
10537 if (!buf1) goto error;
10538 release1 = 1;
10539 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010540 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 if (n == 0)
10542 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010544 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 buf2 = _PyUnicode_AsKind(str2, rkind);
10546 if (!buf2) goto error;
10547 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 rkind = kind2;
10552 sbuf = _PyUnicode_AsKind(self, rkind);
10553 if (!sbuf) goto error;
10554 srelease = 1;
10555 if (release1) PyMem_Free(buf1);
10556 buf1 = _PyUnicode_AsKind(str1, rkind);
10557 if (!buf1) goto error;
10558 release1 = 1;
10559 }
10560 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10561 PyUnicode_GET_LENGTH(str1))); */
10562 product = n * (len2-len1);
10563 if ((product / (len2-len1)) != n) {
10564 PyErr_SetString(PyExc_OverflowError,
10565 "replace string is too long");
10566 goto error;
10567 }
10568 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010569 if (new_size == 0) {
10570 Py_INCREF(unicode_empty);
10571 u = unicode_empty;
10572 goto done;
10573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10575 PyErr_SetString(PyExc_OverflowError,
10576 "replace string is too long");
10577 goto error;
10578 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 u = PyUnicode_New(new_size, maxchar);
10580 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 assert(PyUnicode_KIND(u) == rkind);
10583 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 ires = i = 0;
10585 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010586 while (n-- > 0) {
10587 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010589 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010591 if (j == -1)
10592 break;
10593 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010595 memcpy(res + rkind * ires,
10596 sbuf + rkind * i,
10597 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 }
10600 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010602 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010604 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010611 memcpy(res + rkind * ires,
10612 sbuf + rkind * i,
10613 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 }
10615 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616 /* interleave */
10617 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010618 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010620 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 if (--n <= 0)
10623 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 memcpy(res + rkind * ires,
10625 sbuf + rkind * i,
10626 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 ires++;
10628 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 memcpy(res + rkind * ires,
10631 sbuf + rkind * i,
10632 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010634 }
10635
10636 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010637 unicode_adjust_maxchar(&u);
10638 if (u == NULL)
10639 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010641
10642 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 if (srelease)
10644 PyMem_FREE(sbuf);
10645 if (release1)
10646 PyMem_FREE(buf1);
10647 if (release2)
10648 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010649 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (srelease)
10655 PyMem_FREE(sbuf);
10656 if (release1)
10657 PyMem_FREE(buf1);
10658 if (release2)
10659 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010660 return unicode_result_unchanged(self);
10661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 error:
10663 if (srelease && sbuf)
10664 PyMem_FREE(sbuf);
10665 if (release1 && buf1)
10666 PyMem_FREE(buf1);
10667 if (release2 && buf2)
10668 PyMem_FREE(buf2);
10669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670}
10671
10672/* --- Unicode Object Methods --------------------------------------------- */
10673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010674PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676\n\
10677Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010678characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
10680static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010681unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010683 if (PyUnicode_READY(self) == -1)
10684 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010685 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686}
10687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010688PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690\n\
10691Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010692have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
10694static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010695unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010697 if (PyUnicode_READY(self) == -1)
10698 return NULL;
10699 if (PyUnicode_GET_LENGTH(self) == 0)
10700 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010701 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702}
10703
Benjamin Petersond5890c82012-01-14 13:23:30 -050010704PyDoc_STRVAR(casefold__doc__,
10705 "S.casefold() -> str\n\
10706\n\
10707Return a version of S suitable for caseless comparisons.");
10708
10709static PyObject *
10710unicode_casefold(PyObject *self)
10711{
10712 if (PyUnicode_READY(self) == -1)
10713 return NULL;
10714 if (PyUnicode_IS_ASCII(self))
10715 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010716 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010717}
10718
10719
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010720/* Argument converter. Coerces to a single unicode character */
10721
10722static int
10723convert_uc(PyObject *obj, void *addr)
10724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010726 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010727
Benjamin Peterson14339b62009-01-31 16:36:08 +000010728 uniobj = PyUnicode_FromObject(obj);
10729 if (uniobj == NULL) {
10730 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010731 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010732 return 0;
10733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010735 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010737 Py_DECREF(uniobj);
10738 return 0;
10739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010741 Py_DECREF(uniobj);
10742 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010743}
10744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010745PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010748Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010749done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750
10751static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010752unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010754 Py_ssize_t marg, left;
10755 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 Py_UCS4 fillchar = ' ';
10757
Victor Stinnere9a29352011-10-01 02:14:59 +020010758 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
Benjamin Petersonbac79492012-01-14 13:34:47 -050010761 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 return NULL;
10763
Victor Stinnerc4b49542011-12-11 22:44:26 +010010764 if (PyUnicode_GET_LENGTH(self) >= width)
10765 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
Victor Stinnerc4b49542011-12-11 22:44:26 +010010767 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 left = marg / 2 + (marg & width & 1);
10769
Victor Stinner9310abb2011-10-05 00:59:23 +020010770 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771}
10772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773/* This function assumes that str1 and str2 are readied by the caller. */
10774
Marc-André Lemburge5034372000-08-08 08:04:29 +000010775static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010776unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 int kind1, kind2;
10779 void *data1, *data2;
10780 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 kind1 = PyUnicode_KIND(str1);
10783 kind2 = PyUnicode_KIND(str2);
10784 data1 = PyUnicode_DATA(str1);
10785 data2 = PyUnicode_DATA(str2);
10786 len1 = PyUnicode_GET_LENGTH(str1);
10787 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 for (i = 0; i < len1 && i < len2; ++i) {
10790 Py_UCS4 c1, c2;
10791 c1 = PyUnicode_READ(kind1, data1, i);
10792 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010793
10794 if (c1 != c2)
10795 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010796 }
10797
10798 return (len1 < len2) ? -1 : (len1 != len2);
10799}
10800
Alexander Belopolsky40018472011-02-26 01:02:56 +000010801int
10802PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10805 if (PyUnicode_READY(left) == -1 ||
10806 PyUnicode_READY(right) == -1)
10807 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010808 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010810 PyErr_Format(PyExc_TypeError,
10811 "Can't compare %.100s and %.100s",
10812 left->ob_type->tp_name,
10813 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 return -1;
10815}
10816
Martin v. Löwis5b222132007-06-10 09:51:05 +000010817int
10818PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 Py_ssize_t i;
10821 int kind;
10822 void *data;
10823 Py_UCS4 chr;
10824
Victor Stinner910337b2011-10-03 03:20:16 +020010825 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 if (PyUnicode_READY(uni) == -1)
10827 return -1;
10828 kind = PyUnicode_KIND(uni);
10829 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010830 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10832 if (chr != str[i])
10833 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010834 /* This check keeps Python strings that end in '\0' from comparing equal
10835 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010838 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010840 return 0;
10841}
10842
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010843
Benjamin Peterson29060642009-01-31 22:14:21 +000010844#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010845 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010846
Alexander Belopolsky40018472011-02-26 01:02:56 +000010847PyObject *
10848PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010849{
10850 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010851
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010852 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10853 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 if (PyUnicode_READY(left) == -1 ||
10855 PyUnicode_READY(right) == -1)
10856 return NULL;
10857 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10858 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010859 if (op == Py_EQ) {
10860 Py_INCREF(Py_False);
10861 return Py_False;
10862 }
10863 if (op == Py_NE) {
10864 Py_INCREF(Py_True);
10865 return Py_True;
10866 }
10867 }
10868 if (left == right)
10869 result = 0;
10870 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010871 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010872
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010873 /* Convert the return value to a Boolean */
10874 switch (op) {
10875 case Py_EQ:
10876 v = TEST_COND(result == 0);
10877 break;
10878 case Py_NE:
10879 v = TEST_COND(result != 0);
10880 break;
10881 case Py_LE:
10882 v = TEST_COND(result <= 0);
10883 break;
10884 case Py_GE:
10885 v = TEST_COND(result >= 0);
10886 break;
10887 case Py_LT:
10888 v = TEST_COND(result == -1);
10889 break;
10890 case Py_GT:
10891 v = TEST_COND(result == 1);
10892 break;
10893 default:
10894 PyErr_BadArgument();
10895 return NULL;
10896 }
10897 Py_INCREF(v);
10898 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010900
Brian Curtindfc80e32011-08-10 20:28:54 -050010901 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010902}
10903
Alexander Belopolsky40018472011-02-26 01:02:56 +000010904int
10905PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010906{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010907 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 int kind1, kind2, kind;
10909 void *buf1, *buf2;
10910 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010911 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010912
10913 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010914 sub = PyUnicode_FromObject(element);
10915 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010916 PyErr_Format(PyExc_TypeError,
10917 "'in <string>' requires string as left operand, not %s",
10918 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010920 }
10921
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010923 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 Py_DECREF(sub);
10925 return -1;
10926 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010927 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10928 Py_DECREF(sub);
10929 Py_DECREF(str);
10930 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 kind1 = PyUnicode_KIND(str);
10933 kind2 = PyUnicode_KIND(sub);
10934 kind = kind1 > kind2 ? kind1 : kind2;
10935 buf1 = PyUnicode_DATA(str);
10936 buf2 = PyUnicode_DATA(sub);
10937 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010938 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (!buf1) {
10940 Py_DECREF(sub);
10941 return -1;
10942 }
10943 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010944 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 if (!buf2) {
10946 Py_DECREF(sub);
10947 if (kind1 != kind) PyMem_Free(buf1);
10948 return -1;
10949 }
10950 len1 = PyUnicode_GET_LENGTH(str);
10951 len2 = PyUnicode_GET_LENGTH(sub);
10952
Benjamin Petersonead6b532011-12-20 17:23:42 -060010953 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 case PyUnicode_1BYTE_KIND:
10955 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10956 break;
10957 case PyUnicode_2BYTE_KIND:
10958 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10959 break;
10960 case PyUnicode_4BYTE_KIND:
10961 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10962 break;
10963 default:
10964 result = -1;
10965 assert(0);
10966 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010967
10968 Py_DECREF(str);
10969 Py_DECREF(sub);
10970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (kind1 != kind)
10972 PyMem_Free(buf1);
10973 if (kind2 != kind)
10974 PyMem_Free(buf2);
10975
Guido van Rossum403d68b2000-03-13 15:55:09 +000010976 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010977}
10978
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979/* Concat to string or Unicode object giving a new Unicode object. */
10980
Alexander Belopolsky40018472011-02-26 01:02:56 +000010981PyObject *
10982PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010985 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010986 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995
10996 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010997 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011001 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 }
11005
Victor Stinner488fa492011-12-12 00:01:39 +010011006 u_len = PyUnicode_GET_LENGTH(u);
11007 v_len = PyUnicode_GET_LENGTH(v);
11008 if (u_len > PY_SSIZE_T_MAX - v_len) {
11009 PyErr_SetString(PyExc_OverflowError,
11010 "strings are too large to concat");
11011 goto onError;
11012 }
11013 new_len = u_len + v_len;
11014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011016 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11017 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011020 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011023 copy_characters(w, 0, u, 0, u_len);
11024 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025 Py_DECREF(u);
11026 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011027 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 Py_XDECREF(u);
11032 Py_XDECREF(v);
11033 return NULL;
11034}
11035
Walter Dörwald1ab83302007-05-18 17:15:44 +000011036void
Victor Stinner23e56682011-10-03 03:54:37 +020011037PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011038{
Victor Stinner23e56682011-10-03 03:54:37 +020011039 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011040 Py_UCS4 maxchar, maxchar2;
11041 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011042
11043 if (p_left == NULL) {
11044 if (!PyErr_Occurred())
11045 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011046 return;
11047 }
Victor Stinner23e56682011-10-03 03:54:37 +020011048 left = *p_left;
11049 if (right == NULL || !PyUnicode_Check(left)) {
11050 if (!PyErr_Occurred())
11051 PyErr_BadInternalCall();
11052 goto error;
11053 }
11054
Benjamin Petersonbac79492012-01-14 13:34:47 -050011055 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011056 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011057 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011058 goto error;
11059
Victor Stinner488fa492011-12-12 00:01:39 +010011060 /* Shortcuts */
11061 if (left == unicode_empty) {
11062 Py_DECREF(left);
11063 Py_INCREF(right);
11064 *p_left = right;
11065 return;
11066 }
11067 if (right == unicode_empty)
11068 return;
11069
11070 left_len = PyUnicode_GET_LENGTH(left);
11071 right_len = PyUnicode_GET_LENGTH(right);
11072 if (left_len > PY_SSIZE_T_MAX - right_len) {
11073 PyErr_SetString(PyExc_OverflowError,
11074 "strings are too large to concat");
11075 goto error;
11076 }
11077 new_len = left_len + right_len;
11078
11079 if (unicode_modifiable(left)
11080 && PyUnicode_CheckExact(right)
11081 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011082 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11083 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011084 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011085 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011086 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11087 {
11088 /* append inplace */
11089 if (unicode_resize(p_left, new_len) != 0) {
11090 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11091 * deallocated so it cannot be put back into
11092 * 'variable'. The MemoryError is raised when there
11093 * is no value in 'variable', which might (very
11094 * remotely) be a cause of incompatibilities.
11095 */
11096 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011097 }
Victor Stinner488fa492011-12-12 00:01:39 +010011098 /* copy 'right' into the newly allocated area of 'left' */
11099 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011100 }
Victor Stinner488fa492011-12-12 00:01:39 +010011101 else {
11102 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11103 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11104 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011105
Victor Stinner488fa492011-12-12 00:01:39 +010011106 /* Concat the two Unicode strings */
11107 res = PyUnicode_New(new_len, maxchar);
11108 if (res == NULL)
11109 goto error;
11110 copy_characters(res, 0, left, 0, left_len);
11111 copy_characters(res, left_len, right, 0, right_len);
11112 Py_DECREF(left);
11113 *p_left = res;
11114 }
11115 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011116 return;
11117
11118error:
Victor Stinner488fa492011-12-12 00:01:39 +010011119 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011120}
11121
11122void
11123PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011125 PyUnicode_Append(pleft, right);
11126 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011127}
11128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011129PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011130 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011132Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011133string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011134interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
11136static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011137unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011139 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011140 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011141 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 int kind1, kind2, kind;
11144 void *buf1, *buf2;
11145 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
Jesus Ceaac451502011-04-20 17:09:23 +020011147 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11148 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 kind1 = PyUnicode_KIND(self);
11152 kind2 = PyUnicode_KIND(substring);
11153 kind = kind1 > kind2 ? kind1 : kind2;
11154 buf1 = PyUnicode_DATA(self);
11155 buf2 = PyUnicode_DATA(substring);
11156 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011157 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (!buf1) {
11159 Py_DECREF(substring);
11160 return NULL;
11161 }
11162 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011163 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (!buf2) {
11165 Py_DECREF(substring);
11166 if (kind1 != kind) PyMem_Free(buf1);
11167 return NULL;
11168 }
11169 len1 = PyUnicode_GET_LENGTH(self);
11170 len2 = PyUnicode_GET_LENGTH(substring);
11171
11172 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011173 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 case PyUnicode_1BYTE_KIND:
11175 iresult = ucs1lib_count(
11176 ((Py_UCS1*)buf1) + start, end - start,
11177 buf2, len2, PY_SSIZE_T_MAX
11178 );
11179 break;
11180 case PyUnicode_2BYTE_KIND:
11181 iresult = ucs2lib_count(
11182 ((Py_UCS2*)buf1) + start, end - start,
11183 buf2, len2, PY_SSIZE_T_MAX
11184 );
11185 break;
11186 case PyUnicode_4BYTE_KIND:
11187 iresult = ucs4lib_count(
11188 ((Py_UCS4*)buf1) + start, end - start,
11189 buf2, len2, PY_SSIZE_T_MAX
11190 );
11191 break;
11192 default:
11193 assert(0); iresult = 0;
11194 }
11195
11196 result = PyLong_FromSsize_t(iresult);
11197
11198 if (kind1 != kind)
11199 PyMem_Free(buf1);
11200 if (kind2 != kind)
11201 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202
11203 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011204
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 return result;
11206}
11207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011208PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011209 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011211Encode S using the codec registered for encoding. Default encoding\n\
11212is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011213handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011214a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11215'xmlcharrefreplace' as well as any other name registered with\n\
11216codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217
11218static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011219unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011221 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 char *encoding = NULL;
11223 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011224
Benjamin Peterson308d6372009-09-18 21:42:35 +000011225 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11226 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011228 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011229}
11230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233\n\
11234Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
11237static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011238unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011240 Py_ssize_t i, j, line_pos, src_len, incr;
11241 Py_UCS4 ch;
11242 PyObject *u;
11243 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011245 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011246 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
11248 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
Antoine Pitrou22425222011-10-04 19:10:51 +020011251 if (PyUnicode_READY(self) == -1)
11252 return NULL;
11253
Thomas Wouters7e474022000-07-16 12:04:32 +000011254 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011255 src_len = PyUnicode_GET_LENGTH(self);
11256 i = j = line_pos = 0;
11257 kind = PyUnicode_KIND(self);
11258 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011259 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011260 for (; i < src_len; i++) {
11261 ch = PyUnicode_READ(kind, src_data, i);
11262 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011263 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011267 goto overflow;
11268 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011270 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 goto overflow;
11275 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 if (ch == '\n' || ch == '\r')
11278 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011281 if (!found)
11282 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011283
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 if (!u)
11287 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
Antoine Pitroue71d5742011-10-04 15:55:09 +020011292 for (; i < src_len; i++) {
11293 ch = PyUnicode_READ(kind, src_data, i);
11294 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011296 incr = tabsize - (line_pos % tabsize);
11297 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011298 FILL(kind, dest_data, ' ', j, incr);
11299 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011301 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 line_pos++;
11304 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011305 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011306 if (ch == '\n' || ch == '\r')
11307 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011309 }
11310 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011311 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011312
Antoine Pitroue71d5742011-10-04 15:55:09 +020011313 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011314 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316}
11317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320\n\
11321Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011322such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323arguments start and end are interpreted as in slice notation.\n\
11324\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011325Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
11327static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011330 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011331 Py_ssize_t start;
11332 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011333 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334
Jesus Ceaac451502011-04-20 17:09:23 +020011335 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11336 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (PyUnicode_READY(self) == -1)
11340 return NULL;
11341 if (PyUnicode_READY(substring) == -1)
11342 return NULL;
11343
Victor Stinner7931d9a2011-11-04 00:22:48 +010011344 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
11346 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 if (result == -2)
11349 return NULL;
11350
Christian Heimes217cfd12007-12-02 14:31:20 +000011351 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352}
11353
11354static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011355unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011357 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11358 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361}
11362
Guido van Rossumc2504932007-09-18 19:42:40 +000011363/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011364 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011365static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011366unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367{
Guido van Rossumc2504932007-09-18 19:42:40 +000011368 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011369 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011370
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011371#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011372 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011373#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (_PyUnicode_HASH(self) != -1)
11375 return _PyUnicode_HASH(self);
11376 if (PyUnicode_READY(self) == -1)
11377 return -1;
11378 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011379 /*
11380 We make the hash of the empty string be 0, rather than using
11381 (prefix ^ suffix), since this slightly obfuscates the hash secret
11382 */
11383 if (len == 0) {
11384 _PyUnicode_HASH(self) = 0;
11385 return 0;
11386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387
11388 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011389#define HASH(P) \
11390 x ^= (Py_uhash_t) *P << 7; \
11391 while (--len >= 0) \
11392 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393
Georg Brandl2fb477c2012-02-21 00:33:36 +010011394 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 switch (PyUnicode_KIND(self)) {
11396 case PyUnicode_1BYTE_KIND: {
11397 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11398 HASH(c);
11399 break;
11400 }
11401 case PyUnicode_2BYTE_KIND: {
11402 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11403 HASH(s);
11404 break;
11405 }
11406 default: {
11407 Py_UCS4 *l;
11408 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11409 "Impossible switch case in unicode_hash");
11410 l = PyUnicode_4BYTE_DATA(self);
11411 HASH(l);
11412 break;
11413 }
11414 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011415 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11416 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417
Guido van Rossumc2504932007-09-18 19:42:40 +000011418 if (x == -1)
11419 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011421 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011428Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
11430static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011433 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011434 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011435 Py_ssize_t start;
11436 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Jesus Ceaac451502011-04-20 17:09:23 +020011438 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11439 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (PyUnicode_READY(self) == -1)
11443 return NULL;
11444 if (PyUnicode_READY(substring) == -1)
11445 return NULL;
11446
Victor Stinner7931d9a2011-11-04 00:22:48 +010011447 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (result == -2)
11452 return NULL;
11453
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 if (result < 0) {
11455 PyErr_SetString(PyExc_ValueError, "substring not found");
11456 return NULL;
11457 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458
Christian Heimes217cfd12007-12-02 14:31:20 +000011459 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460}
11461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011465Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011466at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
11468static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011469unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 Py_ssize_t i, length;
11472 int kind;
11473 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 int cased;
11475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (PyUnicode_READY(self) == -1)
11477 return NULL;
11478 length = PyUnicode_GET_LENGTH(self);
11479 kind = PyUnicode_KIND(self);
11480 data = PyUnicode_DATA(self);
11481
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 1)
11484 return PyBool_FromLong(
11485 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011487 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 for (i = 0; i < length; i++) {
11493 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011494
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11496 return PyBool_FromLong(0);
11497 else if (!cased && Py_UNICODE_ISLOWER(ch))
11498 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011500 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501}
11502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011506Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011507at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011510unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 Py_ssize_t i, length;
11513 int kind;
11514 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 int cased;
11516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519 length = PyUnicode_GET_LENGTH(self);
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
11522
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (length == 1)
11525 return PyBool_FromLong(
11526 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011528 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011531
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 for (i = 0; i < length; i++) {
11534 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011535
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11537 return PyBool_FromLong(0);
11538 else if (!cased && Py_UNICODE_ISUPPER(ch))
11539 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011541 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542}
11543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011544PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011547Return True if S is a titlecased string and there is at least one\n\
11548character in S, i.e. upper- and titlecase characters may only\n\
11549follow uncased characters and lowercase characters only cased ones.\n\
11550Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
11552static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011553unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 Py_ssize_t i, length;
11556 int kind;
11557 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 int cased, previous_is_cased;
11559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (PyUnicode_READY(self) == -1)
11561 return NULL;
11562 length = PyUnicode_GET_LENGTH(self);
11563 kind = PyUnicode_KIND(self);
11564 data = PyUnicode_DATA(self);
11565
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 if (length == 1) {
11568 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11569 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11570 (Py_UNICODE_ISUPPER(ch) != 0));
11571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011573 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011576
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 cased = 0;
11578 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 for (i = 0; i < length; i++) {
11580 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011581
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11583 if (previous_is_cased)
11584 return PyBool_FromLong(0);
11585 previous_is_cased = 1;
11586 cased = 1;
11587 }
11588 else if (Py_UNICODE_ISLOWER(ch)) {
11589 if (!previous_is_cased)
11590 return PyBool_FromLong(0);
11591 previous_is_cased = 1;
11592 cased = 1;
11593 }
11594 else
11595 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011597 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598}
11599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011603Return True if all characters in S are whitespace\n\
11604and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t i, length;
11610 int kind;
11611 void *data;
11612
11613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615 length = PyUnicode_GET_LENGTH(self);
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (length == 1)
11621 return PyBool_FromLong(
11622 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011624 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 for (i = 0; i < length; i++) {
11629 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011630 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011633 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011638\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011639Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011640and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641
11642static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011643unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 Py_ssize_t i, length;
11646 int kind;
11647 void *data;
11648
11649 if (PyUnicode_READY(self) == -1)
11650 return NULL;
11651 length = PyUnicode_GET_LENGTH(self);
11652 kind = PyUnicode_KIND(self);
11653 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (length == 1)
11657 return PyBool_FromLong(
11658 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659
11660 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 for (i = 0; i < length; i++) {
11665 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011668 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011674Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011675and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
11677static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011678unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 int kind;
11681 void *data;
11682 Py_ssize_t len, i;
11683
11684 if (PyUnicode_READY(self) == -1)
11685 return NULL;
11686
11687 kind = PyUnicode_KIND(self);
11688 data = PyUnicode_DATA(self);
11689 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011690
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (len == 1) {
11693 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11694 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11695 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011696
11697 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 for (i = 0; i < len; i++) {
11702 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011703 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011705 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011706 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011707}
11708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011709PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011712Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011713False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
11715static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011716unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 Py_ssize_t i, length;
11719 int kind;
11720 void *data;
11721
11722 if (PyUnicode_READY(self) == -1)
11723 return NULL;
11724 length = PyUnicode_GET_LENGTH(self);
11725 kind = PyUnicode_KIND(self);
11726 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 if (length == 1)
11730 return PyBool_FromLong(
11731 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011733 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 for (i = 0; i < length; i++) {
11738 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011741 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742}
11743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011744PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011747Return True if all characters in S are digits\n\
11748and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
11750static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011751unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 Py_ssize_t i, length;
11754 int kind;
11755 void *data;
11756
11757 if (PyUnicode_READY(self) == -1)
11758 return NULL;
11759 length = PyUnicode_GET_LENGTH(self);
11760 kind = PyUnicode_KIND(self);
11761 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (length == 1) {
11765 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11766 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 for (i = 0; i < length; i++) {
11774 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011777 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778}
11779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011783Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011784False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
11786static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011787unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t i, length;
11790 int kind;
11791 void *data;
11792
11793 if (PyUnicode_READY(self) == -1)
11794 return NULL;
11795 length = PyUnicode_GET_LENGTH(self);
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 1)
11801 return PyBool_FromLong(
11802 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011804 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 for (i = 0; i < length; i++) {
11809 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011812 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813}
11814
Martin v. Löwis47383402007-08-15 07:32:56 +000011815int
11816PyUnicode_IsIdentifier(PyObject *self)
11817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 int kind;
11819 void *data;
11820 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011821 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (PyUnicode_READY(self) == -1) {
11824 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 }
11827
11828 /* Special case for empty strings */
11829 if (PyUnicode_GET_LENGTH(self) == 0)
11830 return 0;
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011833
11834 /* PEP 3131 says that the first character must be in
11835 XID_Start and subsequent characters in XID_Continue,
11836 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011837 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011838 letters, digits, underscore). However, given the current
11839 definition of XID_Start and XID_Continue, it is sufficient
11840 to check just for these, except that _ must be allowed
11841 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011843 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011844 return 0;
11845
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011846 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011849 return 1;
11850}
11851
11852PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011854\n\
11855Return True if S is a valid identifier according\n\
11856to the language definition.");
11857
11858static PyObject*
11859unicode_isidentifier(PyObject *self)
11860{
11861 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11862}
11863
Georg Brandl559e5d72008-06-11 18:37:52 +000011864PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011866\n\
11867Return True if all characters in S are considered\n\
11868printable in repr() or S is empty, False otherwise.");
11869
11870static PyObject*
11871unicode_isprintable(PyObject *self)
11872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 Py_ssize_t i, length;
11874 int kind;
11875 void *data;
11876
11877 if (PyUnicode_READY(self) == -1)
11878 return NULL;
11879 length = PyUnicode_GET_LENGTH(self);
11880 kind = PyUnicode_KIND(self);
11881 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011882
11883 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 if (length == 1)
11885 return PyBool_FromLong(
11886 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 for (i = 0; i < length; i++) {
11889 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011890 Py_RETURN_FALSE;
11891 }
11892 }
11893 Py_RETURN_TRUE;
11894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011897 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
11899Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011900iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
11902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011903unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011905 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
Martin v. Löwis18e16552006-02-15 17:27:45 +000011908static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011909unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_READY(self) == -1)
11912 return -1;
11913 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914}
11915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011916PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011919Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011920done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
11922static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011923unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011925 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 Py_UCS4 fillchar = ' ';
11927
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011928 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 return NULL;
11930
Benjamin Petersonbac79492012-01-14 13:34:47 -050011931 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
Victor Stinnerc4b49542011-12-11 22:44:26 +010011934 if (PyUnicode_GET_LENGTH(self) >= width)
11935 return unicode_result_unchanged(self);
11936
11937 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011940PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011943Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944
11945static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011946unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011948 if (PyUnicode_READY(self) == -1)
11949 return NULL;
11950 if (PyUnicode_IS_ASCII(self))
11951 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011952 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953}
11954
/* Strip modes; the values are also used as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode (indexed by the modes above) */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11963
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011964/* externally visible for str.strip(unicode) */
11965PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 void *data;
11969 int kind;
11970 Py_ssize_t i, j, len;
11971 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11974 return NULL;
11975
11976 kind = PyUnicode_KIND(self);
11977 data = PyUnicode_DATA(self);
11978 len = PyUnicode_GET_LENGTH(self);
11979 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11980 PyUnicode_DATA(sepobj),
11981 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011982
Benjamin Peterson14339b62009-01-31 16:36:08 +000011983 i = 0;
11984 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 while (i < len &&
11986 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 i++;
11988 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011989 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011990
Benjamin Peterson14339b62009-01-31 16:36:08 +000011991 j = len;
11992 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 do {
11994 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 } while (j >= i &&
11996 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011999
Victor Stinner7931d9a2011-11-04 00:22:48 +010012000 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001}
12002
12003PyObject*
12004PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12005{
12006 unsigned char *data;
12007 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012008 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009
Victor Stinnerde636f32011-10-01 03:55:54 +020012010 if (PyUnicode_READY(self) == -1)
12011 return NULL;
12012
12013 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12014
Victor Stinner12bab6d2011-10-01 01:53:49 +020012015 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012016 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017
Victor Stinner12bab6d2011-10-01 01:53:49 +020012018 length = end - start;
12019 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012020 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021
Victor Stinnerde636f32011-10-01 03:55:54 +020012022 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012023 PyErr_SetString(PyExc_IndexError, "string index out of range");
12024 return NULL;
12025 }
12026
Victor Stinnerb9275c12011-10-05 14:01:42 +020012027 if (PyUnicode_IS_ASCII(self)) {
12028 kind = PyUnicode_KIND(self);
12029 data = PyUnicode_1BYTE_DATA(self);
12030 return unicode_fromascii(data + start, length);
12031 }
12032 else {
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_1BYTE_DATA(self);
12035 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012036 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012037 length);
12038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
12041static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012042do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 int kind;
12045 void *data;
12046 Py_ssize_t len, i, j;
12047
12048 if (PyUnicode_READY(self) == -1)
12049 return NULL;
12050
12051 kind = PyUnicode_KIND(self);
12052 data = PyUnicode_DATA(self);
12053 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012054
Benjamin Peterson14339b62009-01-31 16:36:08 +000012055 i = 0;
12056 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012058 i++;
12059 }
12060 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012061
Benjamin Peterson14339b62009-01-31 16:36:08 +000012062 j = len;
12063 if (striptype != LEFTSTRIP) {
12064 do {
12065 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 j++;
12068 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012069
Victor Stinner7931d9a2011-11-04 00:22:48 +010012070 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071}
12072
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
12074static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012075do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012076{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012077 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078
Benjamin Peterson14339b62009-01-31 16:36:08 +000012079 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12080 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012081
Benjamin Peterson14339b62009-01-31 16:36:08 +000012082 if (sep != NULL && sep != Py_None) {
12083 if (PyUnicode_Check(sep))
12084 return _PyUnicode_XStrip(self, striptype, sep);
12085 else {
12086 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 "%s arg must be None or str",
12088 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 return NULL;
12090 }
12091 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012092
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012094}
12095
12096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012097PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012099\n\
12100Return a copy of the string S with leading and trailing\n\
12101whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012102If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103
12104static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012105unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012106{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 if (PyTuple_GET_SIZE(args) == 0)
12108 return do_strip(self, BOTHSTRIP); /* Common case */
12109 else
12110 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111}
12112
12113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012114PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116\n\
12117Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012118If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119
12120static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012121unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 if (PyTuple_GET_SIZE(args) == 0)
12124 return do_strip(self, LEFTSTRIP); /* Common case */
12125 else
12126 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127}
12128
12129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012130PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132\n\
12133Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012134If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135
12136static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012137unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 if (PyTuple_GET_SIZE(args) == 0)
12140 return do_strip(self, RIGHTSTRIP); /* Common case */
12141 else
12142 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143}
12144
12145
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012147unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012149 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
Georg Brandl222de0f2009-04-12 12:01:50 +000012152 if (len < 1) {
12153 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012154 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Victor Stinnerc4b49542011-12-11 22:44:26 +010012157 /* no repeat, return original string */
12158 if (len == 1)
12159 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012160
Benjamin Petersonbac79492012-01-14 13:34:47 -050012161 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 return NULL;
12163
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012164 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012165 PyErr_SetString(PyExc_OverflowError,
12166 "repeated string is too long");
12167 return NULL;
12168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012170
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012171 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172 if (!u)
12173 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012174 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (PyUnicode_GET_LENGTH(str) == 1) {
12177 const int kind = PyUnicode_KIND(str);
12178 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012179 if (kind == PyUnicode_1BYTE_KIND) {
12180 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012181 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012182 }
12183 else if (kind == PyUnicode_2BYTE_KIND) {
12184 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012185 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012186 ucs2[n] = fill_char;
12187 } else {
12188 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12189 assert(kind == PyUnicode_4BYTE_KIND);
12190 for (n = 0; n < len; ++n)
12191 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 }
12194 else {
12195 /* number of characters copied this far */
12196 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012197 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 char *to = (char *) PyUnicode_DATA(u);
12199 Py_MEMCPY(to, PyUnicode_DATA(str),
12200 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 n = (done <= nchars-done) ? done : nchars-done;
12203 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012204 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206 }
12207
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012208 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012209 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210}
12211
Alexander Belopolsky40018472011-02-26 01:02:56 +000012212PyObject *
12213PyUnicode_Replace(PyObject *obj,
12214 PyObject *subobj,
12215 PyObject *replobj,
12216 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217{
12218 PyObject *self;
12219 PyObject *str1;
12220 PyObject *str2;
12221 PyObject *result;
12222
12223 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012224 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012227 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 Py_DECREF(self);
12229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 }
12231 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012232 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 Py_DECREF(self);
12234 Py_DECREF(str1);
12235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012237 if (PyUnicode_READY(self) == -1 ||
12238 PyUnicode_READY(str1) == -1 ||
12239 PyUnicode_READY(str2) == -1)
12240 result = NULL;
12241 else
12242 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243 Py_DECREF(self);
12244 Py_DECREF(str1);
12245 Py_DECREF(str2);
12246 return result;
12247}
12248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012249PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012250 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251\n\
12252Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012253old replaced by new. If the optional argument count is\n\
12254given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
12256static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 PyObject *str1;
12260 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012261 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262 PyObject *result;
12263
Martin v. Löwis18e16552006-02-15 17:27:45 +000012264 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012266 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012267 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012269 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 return NULL;
12271 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012272 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 Py_DECREF(str1);
12274 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012275 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012276 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12277 result = NULL;
12278 else
12279 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
12281 Py_DECREF(str1);
12282 Py_DECREF(str2);
12283 return result;
12284}
12285
Alexander Belopolsky40018472011-02-26 01:02:56 +000012286static PyObject *
12287unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012289 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 Py_ssize_t isize;
12291 Py_ssize_t osize, squote, dquote, i, o;
12292 Py_UCS4 max, quote;
12293 int ikind, okind;
12294 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012297 return NULL;
12298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 isize = PyUnicode_GET_LENGTH(unicode);
12300 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 /* Compute length of output, quote characters, and
12303 maximum character */
12304 osize = 2; /* quotes */
12305 max = 127;
12306 squote = dquote = 0;
12307 ikind = PyUnicode_KIND(unicode);
12308 for (i = 0; i < isize; i++) {
12309 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12310 switch (ch) {
12311 case '\'': squote++; osize++; break;
12312 case '"': dquote++; osize++; break;
12313 case '\\': case '\t': case '\r': case '\n':
12314 osize += 2; break;
12315 default:
12316 /* Fast-path ASCII */
12317 if (ch < ' ' || ch == 0x7f)
12318 osize += 4; /* \xHH */
12319 else if (ch < 0x7f)
12320 osize++;
12321 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12322 osize++;
12323 max = ch > max ? ch : max;
12324 }
12325 else if (ch < 0x100)
12326 osize += 4; /* \xHH */
12327 else if (ch < 0x10000)
12328 osize += 6; /* \uHHHH */
12329 else
12330 osize += 10; /* \uHHHHHHHH */
12331 }
12332 }
12333
12334 quote = '\'';
12335 if (squote) {
12336 if (dquote)
12337 /* Both squote and dquote present. Use squote,
12338 and escape them */
12339 osize += squote;
12340 else
12341 quote = '"';
12342 }
12343
12344 repr = PyUnicode_New(osize, max);
12345 if (repr == NULL)
12346 return NULL;
12347 okind = PyUnicode_KIND(repr);
12348 odata = PyUnicode_DATA(repr);
12349
12350 PyUnicode_WRITE(okind, odata, 0, quote);
12351 PyUnicode_WRITE(okind, odata, osize-1, quote);
12352
12353 for (i = 0, o = 1; i < isize; i++) {
12354 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012355
12356 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 if ((ch == quote) || (ch == '\\')) {
12358 PyUnicode_WRITE(okind, odata, o++, '\\');
12359 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012360 continue;
12361 }
12362
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012364 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 PyUnicode_WRITE(okind, odata, o++, '\\');
12366 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012367 }
12368 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 PyUnicode_WRITE(okind, odata, o++, '\\');
12370 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012371 }
12372 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 PyUnicode_WRITE(okind, odata, o++, '\\');
12374 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012375 }
12376
12377 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012378 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 PyUnicode_WRITE(okind, odata, o++, '\\');
12380 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012381 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12382 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012383 }
12384
Georg Brandl559e5d72008-06-11 18:37:52 +000012385 /* Copy ASCII characters as-is */
12386 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012388 }
12389
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012391 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012393 (categories Z* and C* except ASCII space)
12394 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012396 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 if (ch <= 0xff) {
12398 PyUnicode_WRITE(okind, odata, o++, '\\');
12399 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012400 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12401 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012402 }
12403 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 else if (ch >= 0x10000) {
12405 PyUnicode_WRITE(okind, odata, o++, '\\');
12406 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012407 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12408 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12409 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12410 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12411 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12412 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12413 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012415 }
12416 /* Map 16-bit characters to '\uxxxx' */
12417 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 PyUnicode_WRITE(okind, odata, o++, '\\');
12419 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012420 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012424 }
12425 }
12426 /* Copy characters as-is */
12427 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012429 }
12430 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012433 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012434 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435}
12436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012437PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439\n\
12440Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012441such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442arguments start and end are interpreted as in slice notation.\n\
12443\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012444Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
12446static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012449 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012450 Py_ssize_t start;
12451 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453
Jesus Ceaac451502011-04-20 17:09:23 +020012454 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12455 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 if (PyUnicode_READY(self) == -1)
12459 return NULL;
12460 if (PyUnicode_READY(substring) == -1)
12461 return NULL;
12462
Victor Stinner7931d9a2011-11-04 00:22:48 +010012463 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464
12465 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 if (result == -2)
12468 return NULL;
12469
Christian Heimes217cfd12007-12-02 14:31:20 +000012470 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471}
12472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012473PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012474 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012476Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012481 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012482 Py_ssize_t start;
12483 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485
Jesus Ceaac451502011-04-20 17:09:23 +020012486 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12487 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 if (PyUnicode_READY(self) == -1)
12491 return NULL;
12492 if (PyUnicode_READY(substring) == -1)
12493 return NULL;
12494
Victor Stinner7931d9a2011-11-04 00:22:48 +010012495 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
12497 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 if (result == -2)
12500 return NULL;
12501
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502 if (result < 0) {
12503 PyErr_SetString(PyExc_ValueError, "substring not found");
12504 return NULL;
12505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506
Christian Heimes217cfd12007-12-02 14:31:20 +000012507 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508}
12509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012510PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012513Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012514done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012517unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012519 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 Py_UCS4 fillchar = ' ';
12521
Victor Stinnere9a29352011-10-01 02:14:59 +020012522 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012524
Benjamin Petersonbac79492012-01-14 13:34:47 -050012525 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526 return NULL;
12527
Victor Stinnerc4b49542011-12-11 22:44:26 +010012528 if (PyUnicode_GET_LENGTH(self) >= width)
12529 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Victor Stinnerc4b49542011-12-11 22:44:26 +010012531 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532}
12533
Alexander Belopolsky40018472011-02-26 01:02:56 +000012534PyObject *
12535PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536{
12537 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012538
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539 s = PyUnicode_FromObject(s);
12540 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012541 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012542 if (sep != NULL) {
12543 sep = PyUnicode_FromObject(sep);
12544 if (sep == NULL) {
12545 Py_DECREF(s);
12546 return NULL;
12547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548 }
12549
Victor Stinner9310abb2011-10-05 00:59:23 +020012550 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
12552 Py_DECREF(s);
12553 Py_XDECREF(sep);
12554 return result;
12555}
12556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012557PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012558 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559\n\
12560Return a list of the words in S, using sep as the\n\
12561delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012562splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012563whitespace string is a separator and empty strings are\n\
12564removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
12566static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012567unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012569 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012571 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012573 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12574 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 return NULL;
12576
12577 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012580 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012582 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583}
12584
Thomas Wouters477c8d52006-05-27 19:21:47 +000012585PyObject *
12586PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12587{
12588 PyObject* str_obj;
12589 PyObject* sep_obj;
12590 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 int kind1, kind2, kind;
12592 void *buf1 = NULL, *buf2 = NULL;
12593 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012594
12595 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012596 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012598 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012599 if (!sep_obj) {
12600 Py_DECREF(str_obj);
12601 return NULL;
12602 }
12603 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12604 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012605 Py_DECREF(str_obj);
12606 return NULL;
12607 }
12608
Victor Stinner14f8f022011-10-05 20:58:25 +020012609 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012611 kind = Py_MAX(kind1, kind2);
12612 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012614 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 if (!buf1)
12616 goto onError;
12617 buf2 = PyUnicode_DATA(sep_obj);
12618 if (kind2 != kind)
12619 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12620 if (!buf2)
12621 goto onError;
12622 len1 = PyUnicode_GET_LENGTH(str_obj);
12623 len2 = PyUnicode_GET_LENGTH(sep_obj);
12624
Benjamin Petersonead6b532011-12-20 17:23:42 -060012625 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012627 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12628 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12629 else
12630 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 break;
12632 case PyUnicode_2BYTE_KIND:
12633 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12634 break;
12635 case PyUnicode_4BYTE_KIND:
12636 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12637 break;
12638 default:
12639 assert(0);
12640 out = 0;
12641 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012642
12643 Py_DECREF(sep_obj);
12644 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 if (kind1 != kind)
12646 PyMem_Free(buf1);
12647 if (kind2 != kind)
12648 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649
12650 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 onError:
12652 Py_DECREF(sep_obj);
12653 Py_DECREF(str_obj);
12654 if (kind1 != kind && buf1)
12655 PyMem_Free(buf1);
12656 if (kind2 != kind && buf2)
12657 PyMem_Free(buf2);
12658 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659}
12660
12661
12662PyObject *
12663PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12664{
12665 PyObject* str_obj;
12666 PyObject* sep_obj;
12667 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 int kind1, kind2, kind;
12669 void *buf1 = NULL, *buf2 = NULL;
12670 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012671
12672 str_obj = PyUnicode_FromObject(str_in);
12673 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012675 sep_obj = PyUnicode_FromObject(sep_in);
12676 if (!sep_obj) {
12677 Py_DECREF(str_obj);
12678 return NULL;
12679 }
12680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 kind1 = PyUnicode_KIND(str_in);
12682 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012683 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 buf1 = PyUnicode_DATA(str_in);
12685 if (kind1 != kind)
12686 buf1 = _PyUnicode_AsKind(str_in, kind);
12687 if (!buf1)
12688 goto onError;
12689 buf2 = PyUnicode_DATA(sep_obj);
12690 if (kind2 != kind)
12691 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12692 if (!buf2)
12693 goto onError;
12694 len1 = PyUnicode_GET_LENGTH(str_obj);
12695 len2 = PyUnicode_GET_LENGTH(sep_obj);
12696
Benjamin Petersonead6b532011-12-20 17:23:42 -060012697 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012699 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12700 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12701 else
12702 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 break;
12704 case PyUnicode_2BYTE_KIND:
12705 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12706 break;
12707 case PyUnicode_4BYTE_KIND:
12708 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12709 break;
12710 default:
12711 assert(0);
12712 out = 0;
12713 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012714
12715 Py_DECREF(sep_obj);
12716 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 if (kind1 != kind)
12718 PyMem_Free(buf1);
12719 if (kind2 != kind)
12720 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012721
12722 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 onError:
12724 Py_DECREF(sep_obj);
12725 Py_DECREF(str_obj);
12726 if (kind1 != kind && buf1)
12727 PyMem_Free(buf1);
12728 if (kind2 != kind && buf2)
12729 PyMem_Free(buf2);
12730 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012731}
12732
12733PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012736Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012737the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012738found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012739
12740static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012741unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742{
Victor Stinner9310abb2011-10-05 00:59:23 +020012743 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744}
12745
12746PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012747 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012749Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012750the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012751separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752
12753static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012754unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012755{
Victor Stinner9310abb2011-10-05 00:59:23 +020012756 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757}
12758
Alexander Belopolsky40018472011-02-26 01:02:56 +000012759PyObject *
12760PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012761{
12762 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012764 s = PyUnicode_FromObject(s);
12765 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 if (sep != NULL) {
12768 sep = PyUnicode_FromObject(sep);
12769 if (sep == NULL) {
12770 Py_DECREF(s);
12771 return NULL;
12772 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012773 }
12774
Victor Stinner9310abb2011-10-05 00:59:23 +020012775 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012776
12777 Py_DECREF(s);
12778 Py_XDECREF(sep);
12779 return result;
12780}
12781
12782PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012783 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012784\n\
12785Return a list of the words in S, using sep as the\n\
12786delimiter string, starting at the end of the string and\n\
12787working to the front. If maxsplit is given, at most maxsplit\n\
12788splits are done. If sep is not specified, any whitespace string\n\
12789is a separator.");
12790
12791static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012792unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012793{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012794 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012795 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012796 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012797
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012798 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12799 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012800 return NULL;
12801
12802 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012804 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012805 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012806 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012807 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012808}
12809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012810PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812\n\
12813Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012814Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012815is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
12817static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012818unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012820 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012821 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012823 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12824 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825 return NULL;
12826
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012827 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828}
12829
12830static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012831PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012833 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834}
12835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012836PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838\n\
12839Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012840and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841
12842static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012843unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012845 if (PyUnicode_READY(self) == -1)
12846 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012847 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848}
12849
Georg Brandlceee0772007-11-27 23:48:05 +000012850PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012852\n\
12853Return a translation table usable for str.translate().\n\
12854If there is only one argument, it must be a dictionary mapping Unicode\n\
12855ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012856Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012857If there are two arguments, they must be strings of equal length, and\n\
12858in the resulting dictionary, each character in x will be mapped to the\n\
12859character at the same position in y. If there is a third argument, it\n\
12860must be a string, whose characters will be mapped to None in the result.");
12861
12862static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012863unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012864{
12865 PyObject *x, *y = NULL, *z = NULL;
12866 PyObject *new = NULL, *key, *value;
12867 Py_ssize_t i = 0;
12868 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012869
Georg Brandlceee0772007-11-27 23:48:05 +000012870 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12871 return NULL;
12872 new = PyDict_New();
12873 if (!new)
12874 return NULL;
12875 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 int x_kind, y_kind, z_kind;
12877 void *x_data, *y_data, *z_data;
12878
Georg Brandlceee0772007-11-27 23:48:05 +000012879 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012880 if (!PyUnicode_Check(x)) {
12881 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12882 "be a string if there is a second argument");
12883 goto err;
12884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012886 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12887 "arguments must have equal length");
12888 goto err;
12889 }
12890 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 x_kind = PyUnicode_KIND(x);
12892 y_kind = PyUnicode_KIND(y);
12893 x_data = PyUnicode_DATA(x);
12894 y_data = PyUnicode_DATA(y);
12895 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12896 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012897 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012898 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012899 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012900 if (!value) {
12901 Py_DECREF(key);
12902 goto err;
12903 }
Georg Brandlceee0772007-11-27 23:48:05 +000012904 res = PyDict_SetItem(new, key, value);
12905 Py_DECREF(key);
12906 Py_DECREF(value);
12907 if (res < 0)
12908 goto err;
12909 }
12910 /* create entries for deleting chars in z */
12911 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 z_kind = PyUnicode_KIND(z);
12913 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012914 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012916 if (!key)
12917 goto err;
12918 res = PyDict_SetItem(new, key, Py_None);
12919 Py_DECREF(key);
12920 if (res < 0)
12921 goto err;
12922 }
12923 }
12924 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 int kind;
12926 void *data;
12927
Georg Brandlceee0772007-11-27 23:48:05 +000012928 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012929 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012930 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12931 "to maketrans it must be a dict");
12932 goto err;
12933 }
12934 /* copy entries into the new dict, converting string keys to int keys */
12935 while (PyDict_Next(x, &i, &key, &value)) {
12936 if (PyUnicode_Check(key)) {
12937 /* convert string keys to integer keys */
12938 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012939 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012940 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12941 "table must be of length 1");
12942 goto err;
12943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 kind = PyUnicode_KIND(key);
12945 data = PyUnicode_DATA(key);
12946 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012947 if (!newkey)
12948 goto err;
12949 res = PyDict_SetItem(new, newkey, value);
12950 Py_DECREF(newkey);
12951 if (res < 0)
12952 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012953 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012954 /* just keep integer keys */
12955 if (PyDict_SetItem(new, key, value) < 0)
12956 goto err;
12957 } else {
12958 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12959 "be strings or integers");
12960 goto err;
12961 }
12962 }
12963 }
12964 return new;
12965 err:
12966 Py_DECREF(new);
12967 return NULL;
12968}
12969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012970PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972\n\
12973Return a copy of the string S, where all characters have been mapped\n\
12974through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012975Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012976Unmapped characters are left untouched. Characters mapped to None\n\
12977are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012978
12979static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983}
12984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012985PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012988Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989
12990static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012991unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012993 if (PyUnicode_READY(self) == -1)
12994 return NULL;
12995 if (PyUnicode_IS_ASCII(self))
12996 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012997 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998}
12999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013000PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013003Pad a numeric string S with zeros on the left, to fill a field\n\
13004of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005
13006static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013007unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013009 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013010 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013011 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 int kind;
13013 void *data;
13014 Py_UCS4 chr;
13015
Martin v. Löwis18e16552006-02-15 17:27:45 +000013016 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017 return NULL;
13018
Benjamin Petersonbac79492012-01-14 13:34:47 -050013019 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021
Victor Stinnerc4b49542011-12-11 22:44:26 +010013022 if (PyUnicode_GET_LENGTH(self) >= width)
13023 return unicode_result_unchanged(self);
13024
13025 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026
13027 u = pad(self, fill, 0, '0');
13028
Walter Dörwald068325e2002-04-15 13:36:47 +000013029 if (u == NULL)
13030 return NULL;
13031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 kind = PyUnicode_KIND(u);
13033 data = PyUnicode_DATA(u);
13034 chr = PyUnicode_READ(kind, data, fill);
13035
13036 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 PyUnicode_WRITE(kind, data, 0, chr);
13039 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040 }
13041
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013042 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013043 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045
#if 0
/* Compiled out by the surrounding #if 0.  Forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013054PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013057Return True if S starts with the specified prefix, False otherwise.\n\
13058With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013059With optional end, stop comparing S at that position.\n\
13060prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061
13062static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013063unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013064 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013066 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013067 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013068 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013069 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013070 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071
Jesus Ceaac451502011-04-20 17:09:23 +020013072 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013074 if (PyTuple_Check(subobj)) {
13075 Py_ssize_t i;
13076 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013077 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013078 if (substring == NULL)
13079 return NULL;
13080 result = tailmatch(self, substring, start, end, -1);
13081 Py_DECREF(substring);
13082 if (result) {
13083 Py_RETURN_TRUE;
13084 }
13085 }
13086 /* nothing matched */
13087 Py_RETURN_FALSE;
13088 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013089 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013090 if (substring == NULL) {
13091 if (PyErr_ExceptionMatches(PyExc_TypeError))
13092 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13093 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013094 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013095 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013096 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013098 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099}
13100
13101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013102PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013103 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013105Return True if S ends with the specified suffix, False otherwise.\n\
13106With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013107With optional end, stop comparing S at that position.\n\
13108suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109
13110static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013111unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013114 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013115 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013116 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013117 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013118 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
Jesus Ceaac451502011-04-20 17:09:23 +020013120 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013122 if (PyTuple_Check(subobj)) {
13123 Py_ssize_t i;
13124 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013125 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013127 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013129 result = tailmatch(self, substring, start, end, +1);
13130 Py_DECREF(substring);
13131 if (result) {
13132 Py_RETURN_TRUE;
13133 }
13134 }
13135 Py_RETURN_FALSE;
13136 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013138 if (substring == NULL) {
13139 if (PyErr_ExceptionMatches(PyExc_TypeError))
13140 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13141 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013143 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013144 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013146 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147}
13148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013150
13151PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013153\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013154Return a formatted version of S, using substitutions from args and kwargs.\n\
13155The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013156
Eric Smith27bbca62010-11-04 17:06:58 +000013157PyDoc_STRVAR(format_map__doc__,
13158 "S.format_map(mapping) -> str\n\
13159\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013160Return a formatted version of S, using substitutions from mapping.\n\
13161The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013162
Eric Smith4a7d76d2008-05-30 18:10:19 +000013163static PyObject *
13164unicode__format__(PyObject* self, PyObject* args)
13165{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013166 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013167
13168 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13169 return NULL;
13170
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013171 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013173 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013174}
13175
Eric Smith8c663262007-08-25 02:26:07 +000013176PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013178\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013179Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013180
13181static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013182unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 Py_ssize_t size;
13185
13186 /* If it's a compact object, account for base structure +
13187 character data. */
13188 if (PyUnicode_IS_COMPACT_ASCII(v))
13189 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13190 else if (PyUnicode_IS_COMPACT(v))
13191 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013192 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 else {
13194 /* If it is a two-block object, account for base object, and
13195 for character block if present. */
13196 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013197 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013199 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 }
13201 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013202 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013203 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013205 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013206 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207
13208 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013209}
13210
13211PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013213
13214static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013215unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013216{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013217 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 if (!copy)
13219 return NULL;
13220 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013221}
13222
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013224 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013225 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013226 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13227 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013228 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13229 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013230 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013231 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13232 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13233 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13234 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13235 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013236 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013237 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13238 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13239 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013240 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013241 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13242 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13243 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013244 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013245 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013246 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013247 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013248 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13249 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13250 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13251 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13252 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13253 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13254 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13255 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13256 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13257 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13258 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13259 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13260 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13261 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013262 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013263 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013264 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013265 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013266 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013267 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013268 {"maketrans", (PyCFunction) unicode_maketrans,
13269 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013270 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013271#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013272 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013273 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274#endif
13275
Benjamin Peterson14339b62009-01-31 16:36:08 +000013276 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277 {NULL, NULL}
13278};
13279
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013280static PyObject *
13281unicode_mod(PyObject *v, PyObject *w)
13282{
Brian Curtindfc80e32011-08-10 20:28:54 -050013283 if (!PyUnicode_Check(v))
13284 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013286}
13287
13288static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013289 0, /*nb_add*/
13290 0, /*nb_subtract*/
13291 0, /*nb_multiply*/
13292 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013293};
13294
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013296 (lenfunc) unicode_length, /* sq_length */
13297 PyUnicode_Concat, /* sq_concat */
13298 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13299 (ssizeargfunc) unicode_getitem, /* sq_item */
13300 0, /* sq_slice */
13301 0, /* sq_ass_item */
13302 0, /* sq_ass_slice */
13303 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304};
13305
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013306static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013307unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 if (PyUnicode_READY(self) == -1)
13310 return NULL;
13311
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013312 if (PyIndex_Check(item)) {
13313 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013314 if (i == -1 && PyErr_Occurred())
13315 return NULL;
13316 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013318 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013319 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013320 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013321 PyObject *result;
13322 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013323 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013324 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013327 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013328 return NULL;
13329 }
13330
13331 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013332 Py_INCREF(unicode_empty);
13333 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013335 slicelength == PyUnicode_GET_LENGTH(self)) {
13336 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013337 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013338 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013339 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013340 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013341 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013342 src_kind = PyUnicode_KIND(self);
13343 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013344 if (!PyUnicode_IS_ASCII(self)) {
13345 kind_limit = kind_maxchar_limit(src_kind);
13346 max_char = 0;
13347 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13348 ch = PyUnicode_READ(src_kind, src_data, cur);
13349 if (ch > max_char) {
13350 max_char = ch;
13351 if (max_char >= kind_limit)
13352 break;
13353 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013354 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013355 }
Victor Stinner55c99112011-10-13 01:17:06 +020013356 else
13357 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013358 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013359 if (result == NULL)
13360 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013361 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013362 dest_data = PyUnicode_DATA(result);
13363
13364 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013365 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13366 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013367 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013368 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013369 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013370 } else {
13371 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13372 return NULL;
13373 }
13374}
13375
13376static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013377 (lenfunc)unicode_length, /* mp_length */
13378 (binaryfunc)unicode_subscript, /* mp_subscript */
13379 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013380};
13381
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383/* Helpers for PyUnicode_Format() */
13384
13385static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013386getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013388 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 (*p_argidx)++;
13391 if (arglen < 0)
13392 return args;
13393 else
13394 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395 }
13396 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398 return NULL;
13399}
13400
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013401/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013403static PyObject *
13404formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013406 char *p;
13407 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013408 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013409
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410 x = PyFloat_AsDouble(v);
13411 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013412 return NULL;
13413
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013416
Eric Smith0923d1d2009-04-16 20:16:10 +000013417 p = PyOS_double_to_string(x, type, prec,
13418 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013419 if (p == NULL)
13420 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013422 PyMem_Free(p);
13423 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424}
13425
Tim Peters38fd5b62000-09-21 05:43:11 +000013426static PyObject*
13427formatlong(PyObject *val, int flags, int prec, int type)
13428{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013429 char *buf;
13430 int len;
13431 PyObject *str; /* temporary string object. */
13432 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013433
Benjamin Peterson14339b62009-01-31 16:36:08 +000013434 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13435 if (!str)
13436 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013437 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 Py_DECREF(str);
13439 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013440}
13441
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013442static Py_UCS4
13443formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013445 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013446 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013448 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 goto onError;
13451 }
13452 else {
13453 /* Integer input truncated to a character */
13454 long x;
13455 x = PyLong_AsLong(v);
13456 if (x == -1 && PyErr_Occurred())
13457 goto onError;
13458
Victor Stinner8faf8212011-12-08 22:14:11 +010013459 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 PyErr_SetString(PyExc_OverflowError,
13461 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013462 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 }
13464
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013465 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013467
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013469 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013471 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013472}
13473
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013474static int
13475repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13476{
13477 int r;
13478 assert(count > 0);
13479 assert(PyUnicode_Check(obj));
13480 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013481 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013482 if (repeated == NULL)
13483 return -1;
13484 r = _PyAccu_Accumulate(acc, repeated);
13485 Py_DECREF(repeated);
13486 return r;
13487 }
13488 else {
13489 do {
13490 if (_PyAccu_Accumulate(acc, obj))
13491 return -1;
13492 } while (--count);
13493 return 0;
13494 }
13495}
13496
Alexander Belopolsky40018472011-02-26 01:02:56 +000013497PyObject *
13498PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 void *fmt;
13501 int fmtkind;
13502 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013504 int r;
13505 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013506 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013508 PyObject *temp = NULL;
13509 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013510 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013511 _PyAccu acc;
13512 static PyObject *plus, *minus, *blank, *zero, *percent;
13513
13514 if (!plus && !(plus = get_latin1_char('+')))
13515 return NULL;
13516 if (!minus && !(minus = get_latin1_char('-')))
13517 return NULL;
13518 if (!blank && !(blank = get_latin1_char(' ')))
13519 return NULL;
13520 if (!zero && !(zero = get_latin1_char('0')))
13521 return NULL;
13522 if (!percent && !(percent = get_latin1_char('%')))
13523 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013524
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 PyErr_BadInternalCall();
13527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013528 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013529 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013530 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013532 if (PyUnicode_READY(uformat) == -1)
13533 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013534 if (_PyAccu_Init(&acc))
13535 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013536 fmt = PyUnicode_DATA(uformat);
13537 fmtkind = PyUnicode_KIND(uformat);
13538 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13539 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013540
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 arglen = PyTuple_Size(args);
13543 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013544 }
13545 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 arglen = -1;
13547 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013548 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013549 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013550 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013552
13553 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013554 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013555 PyObject *nonfmt;
13556 Py_ssize_t nonfmtpos;
13557 nonfmtpos = fmtpos++;
13558 while (fmtcnt >= 0 &&
13559 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13560 fmtpos++;
13561 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013562 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013563 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 if (nonfmt == NULL)
13565 goto onError;
13566 r = _PyAccu_Accumulate(&acc, nonfmt);
13567 Py_DECREF(nonfmt);
13568 if (r)
13569 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013570 }
13571 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 /* Got a format specifier */
13573 int flags = 0;
13574 Py_ssize_t width = -1;
13575 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013577 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 int isnumok;
13579 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580 void *pbuf = NULL;
13581 Py_ssize_t pindex, len;
13582 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013584 fmtpos++;
13585 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13586 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 Py_ssize_t keylen;
13588 PyObject *key;
13589 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013590
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 if (dict == NULL) {
13592 PyErr_SetString(PyExc_TypeError,
13593 "format requires a mapping");
13594 goto onError;
13595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 /* Skip over balanced parentheses */
13600 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 if (fmtcnt < 0 || pcount > 0) {
13609 PyErr_SetString(PyExc_ValueError,
13610 "incomplete format key");
13611 goto onError;
13612 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013613 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013614 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 if (key == NULL)
13616 goto onError;
13617 if (args_owned) {
13618 Py_DECREF(args);
13619 args_owned = 0;
13620 }
13621 args = PyObject_GetItem(dict, key);
13622 Py_DECREF(key);
13623 if (args == NULL) {
13624 goto onError;
13625 }
13626 args_owned = 1;
13627 arglen = -1;
13628 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013629 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013630 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013632 case '-': flags |= F_LJUST; continue;
13633 case '+': flags |= F_SIGN; continue;
13634 case ' ': flags |= F_BLANK; continue;
13635 case '#': flags |= F_ALT; continue;
13636 case '0': flags |= F_ZERO; continue;
13637 }
13638 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013639 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 if (c == '*') {
13641 v = getnextarg(args, arglen, &argidx);
13642 if (v == NULL)
13643 goto onError;
13644 if (!PyLong_Check(v)) {
13645 PyErr_SetString(PyExc_TypeError,
13646 "* wants int");
13647 goto onError;
13648 }
13649 width = PyLong_AsLong(v);
13650 if (width == -1 && PyErr_Occurred())
13651 goto onError;
13652 if (width < 0) {
13653 flags |= F_LJUST;
13654 width = -width;
13655 }
13656 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013657 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013658 }
13659 else if (c >= '0' && c <= '9') {
13660 width = c - '0';
13661 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 if (c < '0' || c > '9')
13664 break;
13665 if ((width*10) / 10 != width) {
13666 PyErr_SetString(PyExc_ValueError,
13667 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013668 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 }
13670 width = width*10 + (c - '0');
13671 }
13672 }
13673 if (c == '.') {
13674 prec = 0;
13675 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013676 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013677 if (c == '*') {
13678 v = getnextarg(args, arglen, &argidx);
13679 if (v == NULL)
13680 goto onError;
13681 if (!PyLong_Check(v)) {
13682 PyErr_SetString(PyExc_TypeError,
13683 "* wants int");
13684 goto onError;
13685 }
13686 prec = PyLong_AsLong(v);
13687 if (prec == -1 && PyErr_Occurred())
13688 goto onError;
13689 if (prec < 0)
13690 prec = 0;
13691 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013692 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013693 }
13694 else if (c >= '0' && c <= '9') {
13695 prec = c - '0';
13696 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013697 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 if (c < '0' || c > '9')
13699 break;
13700 if ((prec*10) / 10 != prec) {
13701 PyErr_SetString(PyExc_ValueError,
13702 "prec too big");
13703 goto onError;
13704 }
13705 prec = prec*10 + (c - '0');
13706 }
13707 }
13708 } /* prec */
13709 if (fmtcnt >= 0) {
13710 if (c == 'h' || c == 'l' || c == 'L') {
13711 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013712 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 }
13714 }
13715 if (fmtcnt < 0) {
13716 PyErr_SetString(PyExc_ValueError,
13717 "incomplete format");
13718 goto onError;
13719 }
13720 if (c != '%') {
13721 v = getnextarg(args, arglen, &argidx);
13722 if (v == NULL)
13723 goto onError;
13724 }
13725 sign = 0;
13726 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013727 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 switch (c) {
13729
13730 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013731 _PyAccu_Accumulate(&acc, percent);
13732 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013733
13734 case 's':
13735 case 'r':
13736 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013737 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 temp = v;
13739 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013740 }
13741 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 if (c == 's')
13743 temp = PyObject_Str(v);
13744 else if (c == 'r')
13745 temp = PyObject_Repr(v);
13746 else
13747 temp = PyObject_ASCII(v);
13748 if (temp == NULL)
13749 goto onError;
13750 if (PyUnicode_Check(temp))
13751 /* nothing to do */;
13752 else {
13753 Py_DECREF(temp);
13754 PyErr_SetString(PyExc_TypeError,
13755 "%s argument has non-string str()");
13756 goto onError;
13757 }
13758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013759 if (PyUnicode_READY(temp) == -1) {
13760 Py_CLEAR(temp);
13761 goto onError;
13762 }
13763 pbuf = PyUnicode_DATA(temp);
13764 kind = PyUnicode_KIND(temp);
13765 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 if (prec >= 0 && len > prec)
13767 len = prec;
13768 break;
13769
13770 case 'i':
13771 case 'd':
13772 case 'u':
13773 case 'o':
13774 case 'x':
13775 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 isnumok = 0;
13777 if (PyNumber_Check(v)) {
13778 PyObject *iobj=NULL;
13779
13780 if (PyLong_Check(v)) {
13781 iobj = v;
13782 Py_INCREF(iobj);
13783 }
13784 else {
13785 iobj = PyNumber_Long(v);
13786 }
13787 if (iobj!=NULL) {
13788 if (PyLong_Check(iobj)) {
13789 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013790 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013791 Py_DECREF(iobj);
13792 if (!temp)
13793 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013794 if (PyUnicode_READY(temp) == -1) {
13795 Py_CLEAR(temp);
13796 goto onError;
13797 }
13798 pbuf = PyUnicode_DATA(temp);
13799 kind = PyUnicode_KIND(temp);
13800 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 sign = 1;
13802 }
13803 else {
13804 Py_DECREF(iobj);
13805 }
13806 }
13807 }
13808 if (!isnumok) {
13809 PyErr_Format(PyExc_TypeError,
13810 "%%%c format: a number is required, "
13811 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13812 goto onError;
13813 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013814 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013815 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013816 fillobj = zero;
13817 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 break;
13819
13820 case 'e':
13821 case 'E':
13822 case 'f':
13823 case 'F':
13824 case 'g':
13825 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013826 temp = formatfloat(v, flags, prec, c);
13827 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 if (PyUnicode_READY(temp) == -1) {
13830 Py_CLEAR(temp);
13831 goto onError;
13832 }
13833 pbuf = PyUnicode_DATA(temp);
13834 kind = PyUnicode_KIND(temp);
13835 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013836 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013837 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013838 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013839 fillobj = zero;
13840 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013841 break;
13842
13843 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013844 {
13845 Py_UCS4 ch = formatchar(v);
13846 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013847 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013848 temp = _PyUnicode_FromUCS4(&ch, 1);
13849 if (temp == NULL)
13850 goto onError;
13851 pbuf = PyUnicode_DATA(temp);
13852 kind = PyUnicode_KIND(temp);
13853 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013854 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013855 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013856
13857 default:
13858 PyErr_Format(PyExc_ValueError,
13859 "unsupported format character '%c' (0x%x) "
13860 "at index %zd",
13861 (31<=c && c<=126) ? (char)c : '?',
13862 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013863 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013864 goto onError;
13865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013866 /* pbuf is initialized here. */
13867 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013869 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13870 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013872 pindex++;
13873 }
13874 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13875 signobj = plus;
13876 len--;
13877 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 }
13879 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013880 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013882 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 else
13884 sign = 0;
13885 }
13886 if (width < len)
13887 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013889 if (fill != ' ') {
13890 assert(signobj != NULL);
13891 if (_PyAccu_Accumulate(&acc, signobj))
13892 goto onError;
13893 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 if (width > len)
13895 width--;
13896 }
13897 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013898 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013899 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013901 second = get_latin1_char(
13902 PyUnicode_READ(kind, pbuf, pindex + 1));
13903 pindex += 2;
13904 if (second == NULL ||
13905 _PyAccu_Accumulate(&acc, zero) ||
13906 _PyAccu_Accumulate(&acc, second))
13907 goto onError;
13908 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013910 width -= 2;
13911 if (width < 0)
13912 width = 0;
13913 len -= 2;
13914 }
13915 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013916 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013917 if (repeat_accumulate(&acc, fillobj, width - len))
13918 goto onError;
13919 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013920 }
13921 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013922 if (sign) {
13923 assert(signobj != NULL);
13924 if (_PyAccu_Accumulate(&acc, signobj))
13925 goto onError;
13926 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013927 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013928 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13929 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013930 second = get_latin1_char(
13931 PyUnicode_READ(kind, pbuf, pindex + 1));
13932 pindex += 2;
13933 if (second == NULL ||
13934 _PyAccu_Accumulate(&acc, zero) ||
13935 _PyAccu_Accumulate(&acc, second))
13936 goto onError;
13937 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013938 }
13939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013940 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013941 if (temp != NULL) {
13942 assert(pbuf == PyUnicode_DATA(temp));
13943 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013944 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013945 else {
13946 const char *p = (const char *) pbuf;
13947 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013948 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013949 v = PyUnicode_FromKindAndData(kind, p, len);
13950 }
13951 if (v == NULL)
13952 goto onError;
13953 r = _PyAccu_Accumulate(&acc, v);
13954 Py_DECREF(v);
13955 if (r)
13956 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013957 if (width > len && repeat_accumulate(&acc, blank, width - len))
13958 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013959 if (dict && (argidx < arglen) && c != '%') {
13960 PyErr_SetString(PyExc_TypeError,
13961 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013962 goto onError;
13963 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013964 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013965 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013966 } /* until end */
13967 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013968 PyErr_SetString(PyExc_TypeError,
13969 "not all arguments converted during string formatting");
13970 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971 }
13972
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013973 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013974 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013975 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976 }
13977 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013978 Py_XDECREF(temp);
13979 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013980 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013981
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013983 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013984 Py_XDECREF(temp);
13985 Py_XDECREF(second);
13986 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013987 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013989 }
13990 return NULL;
13991}
13992
Jeremy Hylton938ace62002-07-17 16:30:39 +000013993static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013994unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13995
Tim Peters6d6c1a32001-08-02 04:15:00 +000013996static PyObject *
13997unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13998{
Benjamin Peterson29060642009-01-31 22:14:21 +000013999 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014000 static char *kwlist[] = {"object", "encoding", "errors", 0};
14001 char *encoding = NULL;
14002 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014003
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 if (type != &PyUnicode_Type)
14005 return unicode_subtype_new(type, args, kwds);
14006 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014009 if (x == NULL) {
14010 Py_INCREF(unicode_empty);
14011 return unicode_empty;
14012 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014013 if (encoding == NULL && errors == NULL)
14014 return PyObject_Str(x);
14015 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014016 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014017}
14018
Guido van Rossume023fe02001-08-30 03:12:59 +000014019static PyObject *
14020unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14021{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014022 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014023 Py_ssize_t length, char_size;
14024 int share_wstr, share_utf8;
14025 unsigned int kind;
14026 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014027
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014029
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014030 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014031 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014033 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014034 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014035 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014036 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014037 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014038
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014039 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014040 if (self == NULL) {
14041 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 return NULL;
14043 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014044 kind = PyUnicode_KIND(unicode);
14045 length = PyUnicode_GET_LENGTH(unicode);
14046
14047 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014048#ifdef Py_DEBUG
14049 _PyUnicode_HASH(self) = -1;
14050#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014051 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014052#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014053 _PyUnicode_STATE(self).interned = 0;
14054 _PyUnicode_STATE(self).kind = kind;
14055 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014056 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014057 _PyUnicode_STATE(self).ready = 1;
14058 _PyUnicode_WSTR(self) = NULL;
14059 _PyUnicode_UTF8_LENGTH(self) = 0;
14060 _PyUnicode_UTF8(self) = NULL;
14061 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014062 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014063
14064 share_utf8 = 0;
14065 share_wstr = 0;
14066 if (kind == PyUnicode_1BYTE_KIND) {
14067 char_size = 1;
14068 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14069 share_utf8 = 1;
14070 }
14071 else if (kind == PyUnicode_2BYTE_KIND) {
14072 char_size = 2;
14073 if (sizeof(wchar_t) == 2)
14074 share_wstr = 1;
14075 }
14076 else {
14077 assert(kind == PyUnicode_4BYTE_KIND);
14078 char_size = 4;
14079 if (sizeof(wchar_t) == 4)
14080 share_wstr = 1;
14081 }
14082
14083 /* Ensure we won't overflow the length. */
14084 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14085 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014086 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014088 data = PyObject_MALLOC((length + 1) * char_size);
14089 if (data == NULL) {
14090 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014091 goto onError;
14092 }
14093
Victor Stinnerc3c74152011-10-02 20:39:55 +020014094 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014095 if (share_utf8) {
14096 _PyUnicode_UTF8_LENGTH(self) = length;
14097 _PyUnicode_UTF8(self) = data;
14098 }
14099 if (share_wstr) {
14100 _PyUnicode_WSTR_LENGTH(self) = length;
14101 _PyUnicode_WSTR(self) = (wchar_t *)data;
14102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014104 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014105 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014106 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014107#ifdef Py_DEBUG
14108 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14109#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014110 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014111 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014112
14113onError:
14114 Py_DECREF(unicode);
14115 Py_DECREF(self);
14116 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014117}
14118
/* Docstring text installed in the tp_doc slot of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014125
/* Forward declaration: referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Instances are created through unicode_new(); the
   type may be subclassed (Py_TPFLAGS_BASETYPE) and its base is
   PyBaseObject_Type. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14171
14172/* Initialize the Unicode implementation */
14173
Victor Stinner3a50e702011-10-18 21:21:00 +020014174int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014176 int i;
14177
Thomas Wouters477c8d52006-05-27 19:21:47 +000014178 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014179 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014180 0x000A, /* LINE FEED */
14181 0x000D, /* CARRIAGE RETURN */
14182 0x001C, /* FILE SEPARATOR */
14183 0x001D, /* GROUP SEPARATOR */
14184 0x001E, /* RECORD SEPARATOR */
14185 0x0085, /* NEXT LINE */
14186 0x2028, /* LINE SEPARATOR */
14187 0x2029, /* PARAGRAPH SEPARATOR */
14188 };
14189
Fred Drakee4315f52000-05-09 19:53:39 +000014190 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014191 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014192 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014193 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014194 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014195
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014196 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014197 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014198 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014199 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014200
14201 /* initialize the linebreak bloom filter */
14202 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014203 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014204 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014205
14206 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014207
14208#ifdef HAVE_MBCS
14209 winver.dwOSVersionInfoSize = sizeof(winver);
14210 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14211 PyErr_SetFromWindowsErr(0);
14212 return -1;
14213 }
14214#endif
14215 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014216}
14217
14218/* Finalize the Unicode implementation */
14219
/* Always returns 0; the body performs no other work. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14225
Guido van Rossumd57fd912000-03-10 22:53:23 +000014226void
Thomas Wouters78890102000-07-22 19:25:51 +000014227_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014228{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014229 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014230
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014231 Py_XDECREF(unicode_empty);
14232 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014233
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014234 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014235 if (unicode_latin1[i]) {
14236 Py_DECREF(unicode_latin1[i]);
14237 unicode_latin1[i] = NULL;
14238 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014239 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014240 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014241 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014243
Walter Dörwald16807132007-05-25 13:52:07 +000014244void
14245PyUnicode_InternInPlace(PyObject **p)
14246{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014247 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014249#ifdef Py_DEBUG
14250 assert(s != NULL);
14251 assert(_PyUnicode_CHECK(s));
14252#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014253 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014254 return;
14255#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014256 /* If it's a subclass, we don't really know what putting
14257 it in the interned dict might do. */
14258 if (!PyUnicode_CheckExact(s))
14259 return;
14260 if (PyUnicode_CHECK_INTERNED(s))
14261 return;
14262 if (interned == NULL) {
14263 interned = PyDict_New();
14264 if (interned == NULL) {
14265 PyErr_Clear(); /* Don't leave an exception */
14266 return;
14267 }
14268 }
14269 /* It might be that the GetItem call fails even
14270 though the key is present in the dictionary,
14271 namely when this happens during a stack overflow. */
14272 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014273 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014275
Benjamin Peterson29060642009-01-31 22:14:21 +000014276 if (t) {
14277 Py_INCREF(t);
14278 Py_DECREF(*p);
14279 *p = t;
14280 return;
14281 }
Walter Dörwald16807132007-05-25 13:52:07 +000014282
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014284 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014285 PyErr_Clear();
14286 PyThreadState_GET()->recursion_critical = 0;
14287 return;
14288 }
14289 PyThreadState_GET()->recursion_critical = 0;
14290 /* The two references in interned are not counted by refcnt.
14291 The deallocator will take care of this */
14292 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014293 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014294}
14295
14296void
14297PyUnicode_InternImmortal(PyObject **p)
14298{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014299 PyUnicode_InternInPlace(p);
14300 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014301 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014302 Py_INCREF(*p);
14303 }
Walter Dörwald16807132007-05-25 13:52:07 +000014304}
14305
14306PyObject *
14307PyUnicode_InternFromString(const char *cp)
14308{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 PyObject *s = PyUnicode_FromString(cp);
14310 if (s == NULL)
14311 return NULL;
14312 PyUnicode_InternInPlace(&s);
14313 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014314}
14315
Alexander Belopolsky40018472011-02-26 01:02:56 +000014316void
14317_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014320 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 Py_ssize_t i, n;
14322 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014323
Benjamin Peterson14339b62009-01-31 16:36:08 +000014324 if (interned == NULL || !PyDict_Check(interned))
14325 return;
14326 keys = PyDict_Keys(interned);
14327 if (keys == NULL || !PyList_Check(keys)) {
14328 PyErr_Clear();
14329 return;
14330 }
Walter Dörwald16807132007-05-25 13:52:07 +000014331
Benjamin Peterson14339b62009-01-31 16:36:08 +000014332 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14333 detector, interned unicode strings are not forcibly deallocated;
14334 rather, we give them their stolen references back, and then clear
14335 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014336
Benjamin Peterson14339b62009-01-31 16:36:08 +000014337 n = PyList_GET_SIZE(keys);
14338 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014339 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014340 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014341 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014342 if (PyUnicode_READY(s) == -1) {
14343 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014344 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014346 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014347 case SSTATE_NOT_INTERNED:
14348 /* XXX Shouldn't happen */
14349 break;
14350 case SSTATE_INTERNED_IMMORTAL:
14351 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014352 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014353 break;
14354 case SSTATE_INTERNED_MORTAL:
14355 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 break;
14358 default:
14359 Py_FatalError("Inconsistent interned string state.");
14360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014361 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 }
14363 fprintf(stderr, "total size of all interned strings: "
14364 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14365 "mortal/immortal\n", mortal_size, immortal_size);
14366 Py_DECREF(keys);
14367 PyDict_Clear(interned);
14368 Py_DECREF(interned);
14369 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014370}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014371
14372
14373/********************* Unicode Iterator **************************/
14374
/* Instance layout of a str iterator (objects of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to be returned */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14380
14381static void
14382unicodeiter_dealloc(unicodeiterobject *it)
14383{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014384 _PyObject_GC_UNTRACK(it);
14385 Py_XDECREF(it->it_seq);
14386 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014387}
14388
/* GC traversal for str iterators: the only object reference held is
   it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14395
14396static PyObject *
14397unicodeiter_next(unicodeiterobject *it)
14398{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014399 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014400
Benjamin Peterson14339b62009-01-31 16:36:08 +000014401 assert(it != NULL);
14402 seq = it->it_seq;
14403 if (seq == NULL)
14404 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014405 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014407 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14408 int kind = PyUnicode_KIND(seq);
14409 void *data = PyUnicode_DATA(seq);
14410 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14411 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014412 if (item != NULL)
14413 ++it->it_index;
14414 return item;
14415 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014416
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 Py_DECREF(seq);
14418 it->it_seq = NULL;
14419 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014420}
14421
14422static PyObject *
14423unicodeiter_len(unicodeiterobject *it)
14424{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014425 Py_ssize_t len = 0;
14426 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014427 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014428 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014429}
14430
14431PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14432
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014433static PyObject *
14434unicodeiter_reduce(unicodeiterobject *it)
14435{
14436 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014437 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014438 it->it_seq, it->it_index);
14439 } else {
14440 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14441 if (u == NULL)
14442 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014443 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014444 }
14445}
14446
14447PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14448
14449static PyObject *
14450unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14451{
14452 Py_ssize_t index = PyLong_AsSsize_t(state);
14453 if (index == -1 && PyErr_Occurred())
14454 return NULL;
14455 if (index < 0)
14456 index = 0;
14457 it->it_index = index;
14458 Py_RETURN_NONE;
14459}
14460
14461PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14462
/* Method table installed in the tp_methods slot of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
14472
/* Type object for str iterators.  Instances are created by unicode_iter()
   and take part in garbage collection (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
14505
14506static PyObject *
14507unicode_iter(PyObject *seq)
14508{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014509 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014510
Benjamin Peterson14339b62009-01-31 16:36:08 +000014511 if (!PyUnicode_Check(seq)) {
14512 PyErr_BadInternalCall();
14513 return NULL;
14514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014515 if (PyUnicode_READY(seq) == -1)
14516 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014517 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14518 if (it == NULL)
14519 return NULL;
14520 it->it_index = 0;
14521 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014522 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014523 _PyObject_GC_TRACK(it);
14524 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014525}
14526
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014527
14528size_t
14529Py_UNICODE_strlen(const Py_UNICODE *u)
14530{
14531 int res = 0;
14532 while(*u++)
14533 res++;
14534 return res;
14535}
14536
14537Py_UNICODE*
14538Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14539{
14540 Py_UNICODE *u = s1;
14541 while ((*u++ = *s2++));
14542 return s1;
14543}
14544
14545Py_UNICODE*
14546Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14547{
14548 Py_UNICODE *u = s1;
14549 while ((*u++ = *s2++))
14550 if (n-- == 0)
14551 break;
14552 return s1;
14553}
14554
14555Py_UNICODE*
14556Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14557{
14558 Py_UNICODE *u1 = s1;
14559 u1 += Py_UNICODE_strlen(u1);
14560 Py_UNICODE_strcpy(u1, s2);
14561 return s1;
14562}
14563
14564int
14565Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14566{
14567 while (*s1 && *s2 && *s1 == *s2)
14568 s1++, s2++;
14569 if (*s1 && *s2)
14570 return (*s1 < *s2) ? -1 : +1;
14571 if (*s1)
14572 return 1;
14573 if (*s2)
14574 return -1;
14575 return 0;
14576}
14577
14578int
14579Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14580{
14581 register Py_UNICODE u1, u2;
14582 for (; n != 0; n--) {
14583 u1 = *s1;
14584 u2 = *s2;
14585 if (u1 != u2)
14586 return (u1 < u2) ? -1 : +1;
14587 if (u1 == '\0')
14588 return 0;
14589 s1++;
14590 s2++;
14591 }
14592 return 0;
14593}
14594
14595Py_UNICODE*
14596Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14597{
14598 const Py_UNICODE *p;
14599 for (p = s; *p; p++)
14600 if (*p == c)
14601 return (Py_UNICODE*)p;
14602 return NULL;
14603}
14604
14605Py_UNICODE*
14606Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14607{
14608 const Py_UNICODE *p;
14609 p = s + Py_UNICODE_strlen(s);
14610 while (p != s) {
14611 p--;
14612 if (*p == c)
14613 return (Py_UNICODE*)p;
14614 }
14615 return NULL;
14616}
Victor Stinner331ea922010-08-10 16:37:20 +000014617
Victor Stinner71133ff2010-09-01 23:43:53 +000014618Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014619PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014620{
Victor Stinner577db2c2011-10-11 22:12:48 +020014621 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014622 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014624 if (!PyUnicode_Check(unicode)) {
14625 PyErr_BadArgument();
14626 return NULL;
14627 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014628 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014629 if (u == NULL)
14630 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014631 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014632 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014633 PyErr_NoMemory();
14634 return NULL;
14635 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014636 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014637 size *= sizeof(Py_UNICODE);
14638 copy = PyMem_Malloc(size);
14639 if (copy == NULL) {
14640 PyErr_NoMemory();
14641 return NULL;
14642 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014643 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014644 return copy;
14645}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014646
Georg Brandl66c221e2010-10-14 07:04:07 +000014647/* A _string module, to export formatter_parser and formatter_field_name_split
14648 to the string.Formatter class implemented in Python. */
14649
14650static PyMethodDef _string_methods[] = {
14651 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14652 METH_O, PyDoc_STR("split the argument as a field name")},
14653 {"formatter_parser", (PyCFunction) formatter_parser,
14654 METH_O, PyDoc_STR("parse the argument as a format string")},
14655 {NULL, NULL}
14656};
14657
14658static struct PyModuleDef _string_module = {
14659 PyModuleDef_HEAD_INIT,
14660 "_string",
14661 PyDoc_STR("string helper module"),
14662 0,
14663 _string_methods,
14664 NULL,
14665 NULL,
14666 NULL,
14667 NULL
14668};
14669
14670PyMODINIT_FUNC
14671PyInit__string(void)
14672{
14673 return PyModule_Create(&_string_module);
14674}
14675
14676
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014677#ifdef __cplusplus
14678}
14679#endif