blob: 7e73bc226c16862d46164bafe93b53ab857d80ec [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection.  Exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN gets defined; little endian is the default
   unless WORDS_BIGENDIAN is defined. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros of this file.  Debug builds run
   _PyUnicode_CheckConsistency() without the content scan; all other builds
   only perform the PyUnicode_Check() type test. */
#if !defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* --- Field accessors ------------------------------------------------------
   The underscore-prefixed macros cast the object and name a struct field
   directly; the ones without assert() expand to lvalues. */

/* utf8 field of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: the address just past the PyASCIIObject
   header for a compact ASCII string, the utf8 field otherwise. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char*)((PyASCIIObject*)(op) + 1))              \
         : _PyUnicode_UTF8(op))

/* utf8_length field of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the length field for a compact ASCII
   string, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject*)(op))->length                   \
         : _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)  (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)   (((PyASCIIObject *)(op))->hash)

/* Checked reads of state.kind and length. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): asserts the object check,
   then evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     (!PyUnicode_IS_READY(op) ?                     \
          _PyUnicode_Ready(op) :                    \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer is the string's own character buffer, i.e. no
   separate UTF-8 block exists.  Must not be applied to compact ASCII
   strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the string's own character buffer.  Only the
   macro argument is referenced, so the macro works whatever the operand is
   called at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* Non-zero when the object owns a UTF-8 block of its own: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer differs
   from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                    \
      _PyUnicode_UTF8(op) &&                                \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a wstr block of its own: its wstr pointer
   is set and it is not the case that the string is ready with wstr equal
   to the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) &&                                \
      !(PyUnicode_IS_READY(op) &&                           \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer to the buffer the converted characters
   are written to; it is cast to "to_type *".

   Every argument is evaluated exactly once.  `to` is parenthesized before
   the cast, so an expression such as `buf + offset` is converted as a
   whole instead of having the cast bind to `buf` alone.

   The main loop copies four characters per iteration; the trailing loop
   handles the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code c is one of the characters listed below, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Line break lookup table for ASCII: entry c is 1 when the ASCII code c is
   one of the characters listed below, else 0. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-only validation of the internal state of a unicode object.

   Every inconsistency trips an assert(); the function itself always
   returns 1, which lets callers wrap the call in assert().

   op            -- object to validate, must pass PyUnicode_Check()
   check_content -- if non-zero (and the string is not in the wchar_t-only
                    state) also scan the characters and verify that the
                    largest one matches the storage kind and ASCII flag */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    /* storage kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact object: characters live behind data.any */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert(compact->utf8 != data);
                }
                else {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the character data exactly when the storage
               kind has the width of wchar_t */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar)
                maxchar = ch;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
#if defined(HAVE_MBCS)
/* OSVERSIONINFOEX record, only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
510/* stuff to implement simple "bloom filters" for Unicode characters.
511 to keep things simple, we use a single bitmask, using the least 5
512 bits from each unicode characters as the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit selected by ch in mask (mask must be an lvalue);
   BLOOM evaluates to non-zero if that bit is set in mask.  Both macro
   arguments are parenthesized so that expression arguments such as
   `a | b` are combined as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for characters below 128, otherwise the
   bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() only runs
   when the mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* Membership test of chr in str: the bloom mask is checked first, and
   PyUnicode_FindChar() over the whole string is only called when the mask
   does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001001 if (maxchar > MAX_UNICODE) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "invalid maximum character passed to PyUnicode_New");
1004 return NULL;
1005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 kind_state = PyUnicode_4BYTE_KIND;
1007 char_size = 4;
1008 if (sizeof(wchar_t) == 4)
1009 is_sharing = 1;
1010 }
1011
1012 /* Ensure we won't overflow the size. */
1013 if (size < 0) {
1014 PyErr_SetString(PyExc_SystemError,
1015 "Negative size passed to PyUnicode_New");
1016 return NULL;
1017 }
1018 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1019 return PyErr_NoMemory();
1020
1021 /* Duplicated allocation code from _PyObject_New() instead of a call to
1022 * PyObject_New() so we are able to allocate space for the object and
1023 * it's data buffer.
1024 */
1025 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1026 if (obj == NULL)
1027 return PyErr_NoMemory();
1028 obj = PyObject_INIT(obj, &PyUnicode_Type);
1029 if (obj == NULL)
1030 return NULL;
1031
1032 unicode = (PyCompactUnicodeObject *)obj;
1033 if (is_ascii)
1034 data = ((PyASCIIObject*)obj) + 1;
1035 else
1036 data = unicode + 1;
1037 _PyUnicode_LENGTH(unicode) = size;
1038 _PyUnicode_HASH(unicode) = -1;
1039 _PyUnicode_STATE(unicode).interned = 0;
1040 _PyUnicode_STATE(unicode).kind = kind_state;
1041 _PyUnicode_STATE(unicode).compact = 1;
1042 _PyUnicode_STATE(unicode).ready = 1;
1043 _PyUnicode_STATE(unicode).ascii = is_ascii;
1044 if (is_ascii) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 }
1048 else if (kind_state == PyUnicode_1BYTE_KIND) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 }
1055 else {
1056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 if (kind_state == PyUnicode_2BYTE_KIND)
1059 ((Py_UCS2*)data)[size] = 0;
1060 else /* kind_state == PyUnicode_4BYTE_KIND */
1061 ((Py_UCS4*)data)[size] = 0;
1062 if (is_sharing) {
1063 _PyUnicode_WSTR_LENGTH(unicode) = size;
1064 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1065 }
1066 else {
1067 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1068 _PyUnicode_WSTR(unicode) = NULL;
1069 }
1070 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001071 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 return obj;
1073}
1074
1075#if SIZEOF_WCHAR_T == 2
1076/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1077 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001078 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079
1080 This function assumes that unicode can hold one more code point than wstr
1081 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001082static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001084 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085{
1086 const wchar_t *iter;
1087 Py_UCS4 *ucs4_out;
1088
Victor Stinner910337b2011-10-03 03:20:16 +02001089 assert(unicode != NULL);
1090 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1092 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1093
1094 for (iter = begin; iter < end; ) {
1095 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1096 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001097 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1098 && (iter+1) < end
1099 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100 {
Victor Stinner551ac952011-11-29 22:58:13 +01001101 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 iter += 2;
1103 }
1104 else {
1105 *ucs4_out++ = *iter;
1106 iter++;
1107 }
1108 }
1109 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1110 _PyUnicode_GET_LENGTH(unicode)));
1111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112}
1113#endif
1114
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115static int
Victor Stinner488fa492011-12-12 00:01:39 +01001116unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117{
Victor Stinner488fa492011-12-12 00:01:39 +01001118 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001119 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001120 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return -1;
1122 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001123 return 0;
1124}
1125
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001126static int
1127_copy_characters(PyObject *to, Py_ssize_t to_start,
1128 PyObject *from, Py_ssize_t from_start,
1129 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 unsigned int from_kind, to_kind;
1132 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001133 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
1136 assert(PyUnicode_Check(to));
1137 assert(PyUnicode_IS_READY(from));
1138 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001144 if (how_many == 0)
1145 return 0;
1146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001150 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152#ifdef Py_DEBUG
1153 if (!check_maxchar
1154 && (from_kind > to_kind
1155 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001157 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1158 Py_UCS4 ch;
1159 Py_ssize_t i;
1160 for (i=0; i < how_many; i++) {
1161 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1162 assert(ch <= to_maxchar);
1163 }
1164 }
1165#endif
1166 fast = (from_kind == to_kind);
1167 if (check_maxchar
1168 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1169 {
1170 /* deny latin1 => ascii */
1171 fast = 0;
1172 }
1173
1174 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001175 Py_MEMCPY((char*)to_data + to_kind * to_start,
1176 (char*)from_data + from_kind * from_start,
1177 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
1180 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS2,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_2BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001189 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS1, Py_UCS4,
1194 PyUnicode_1BYTE_DATA(from) + from_start,
1195 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
1199 else if (from_kind == PyUnicode_2BYTE_KIND
1200 && to_kind == PyUnicode_4BYTE_KIND)
1201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS2, Py_UCS4,
1204 PyUnicode_2BYTE_DATA(from) + from_start,
1205 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_4BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 /* check if max_char(from substring) <= max_char(to) */
1211 if (from_kind > to_kind
1212 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001213 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 /* slow path to check for character overflow */
1216 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001217 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 Py_ssize_t i;
1219
Victor Stinner56c161a2011-10-06 02:47:11 +02001220#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001221 for (i=0; i < how_many; i++) {
1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001223 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001224 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1225 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001226#else
1227 if (!check_maxchar) {
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1231 }
1232 }
1233 else {
1234 for (i=0; i < how_many; i++) {
1235 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1236 if (ch > to_maxchar)
1237 return 1;
1238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239 }
1240 }
1241#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001244 assert(0 && "inconsistent state");
1245 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001246 }
1247 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001248 return 0;
1249}
1250
1251static void
1252copy_characters(PyObject *to, Py_ssize_t to_start,
1253 PyObject *from, Py_ssize_t from_start,
1254 Py_ssize_t how_many)
1255{
1256 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1257}
1258
1259Py_ssize_t
1260PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1261 PyObject *from, Py_ssize_t from_start,
1262 Py_ssize_t how_many)
1263{
1264 int err;
1265
1266 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1267 PyErr_BadInternalCall();
1268 return -1;
1269 }
1270
Benjamin Petersonbac79492012-01-14 13:34:47 -05001271 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001273 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001274 return -1;
1275
1276 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1277 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1278 PyErr_Format(PyExc_SystemError,
1279 "Cannot write %zi characters at %zi "
1280 "in a string of %zi characters",
1281 how_many, to_start, PyUnicode_GET_LENGTH(to));
1282 return -1;
1283 }
1284
1285 if (how_many == 0)
1286 return 0;
1287
Victor Stinner488fa492011-12-12 00:01:39 +01001288 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289 return -1;
1290
1291 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1292 if (err) {
1293 PyErr_Format(PyExc_SystemError,
1294 "Cannot copy %s characters "
1295 "into a string of %s characters",
1296 unicode_kind_name(from),
1297 unicode_kind_name(to));
1298 return -1;
1299 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001300 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinner17222162011-09-28 22:15:37 +02001303/* Find the maximum code point and count the number of surrogate pairs so a
1304 correct string length can be computed before converting a string to UCS4.
1305 This function counts single surrogates as a character and not as a pair.
1306
1307 Return 0 on success, or -1 on error. */
1308static int
1309find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1310 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311{
1312 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001313 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314
Victor Stinnerc53be962011-10-02 21:33:54 +02001315 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 *num_surrogates = 0;
1317 *maxchar = 0;
1318
1319 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001321 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1322 && (iter+1) < end
1323 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001325 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 iter += 2;
1328 }
1329 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001331 {
1332 ch = *iter;
1333 iter++;
1334 }
1335 if (ch > *maxchar) {
1336 *maxchar = ch;
1337 if (*maxchar > MAX_UNICODE) {
1338 PyErr_Format(PyExc_ValueError,
1339 "character U+%x is not in range [U+0000; U+10ffff]",
1340 ch);
1341 return -1;
1342 }
1343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 }
1345 return 0;
1346}
1347
1348#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001349static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350#endif
1351
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001352int
1353_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 wchar_t *end;
1356 Py_UCS4 maxchar = 0;
1357 Py_ssize_t num_surrogates;
1358#if SIZEOF_WCHAR_T == 2
1359 Py_ssize_t length_wo_surrogates;
1360#endif
1361
Georg Brandl7597add2011-10-05 16:36:47 +02001362 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001363 strings were created using _PyObject_New() and where no canonical
1364 representation (the str field) has been set yet aka strings
1365 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001366 assert(_PyUnicode_CHECK(unicode));
1367 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001370 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001371 /* Actually, it should neither be interned nor be anything else: */
1372 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
1374#ifdef Py_DEBUG
1375 ++unicode_ready_calls;
1376#endif
1377
1378 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001379 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001380 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382
1383 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001384 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1385 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 PyErr_NoMemory();
1387 return -1;
1388 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001389 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 _PyUnicode_WSTR(unicode), end,
1391 PyUnicode_1BYTE_DATA(unicode));
1392 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1393 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1394 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1395 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001397 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001401 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8(unicode) = NULL;
1403 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 }
1405 PyObject_FREE(_PyUnicode_WSTR(unicode));
1406 _PyUnicode_WSTR(unicode) = NULL;
1407 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408 }
1409 /* In this case we might have to convert down from 4-byte native
1410 wchar_t to 2-byte unicode. */
1411 else if (maxchar < 65536) {
1412 assert(num_surrogates == 0 &&
1413 "FindMaxCharAndNumSurrogatePairs() messed up");
1414
Victor Stinner506f5922011-09-28 22:34:18 +02001415#if SIZEOF_WCHAR_T == 2
1416 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1419 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1420 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001423#else
1424 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001425 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001426 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001428 PyErr_NoMemory();
1429 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
Victor Stinner506f5922011-09-28 22:34:18 +02001431 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1432 _PyUnicode_WSTR(unicode), end,
1433 PyUnicode_2BYTE_DATA(unicode));
1434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyObject_FREE(_PyUnicode_WSTR(unicode));
1440 _PyUnicode_WSTR(unicode) = NULL;
1441 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1442#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 }
1444 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1445 else {
1446#if SIZEOF_WCHAR_T == 2
1447 /* in case the native representation is 2-bytes, we need to allocate a
1448 new normalized 4-byte version. */
1449 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1451 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyErr_NoMemory();
1453 return -1;
1454 }
1455 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1456 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001457 _PyUnicode_UTF8(unicode) = NULL;
1458 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001459 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1460 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001461 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 PyObject_FREE(_PyUnicode_WSTR(unicode));
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1465#else
1466 assert(num_surrogates == 0);
1467
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1473#endif
1474 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1475 }
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001477 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return 0;
1479}
1480
Alexander Belopolsky40018472011-02-26 01:02:56 +00001481static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001482unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483{
Walter Dörwald16807132007-05-25 13:52:07 +00001484 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 case SSTATE_NOT_INTERNED:
1486 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001487
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 case SSTATE_INTERNED_MORTAL:
1489 /* revive dead object temporarily for DelItem */
1490 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001491 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 Py_FatalError(
1493 "deletion of interned string failed");
1494 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001495
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_INTERNED_IMMORTAL:
1497 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 default:
1500 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001501 }
1502
Victor Stinner03490912011-10-03 23:45:12 +02001503 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001505 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001506 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001507 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1508 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001509
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001510 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511}
1512
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character entries of unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that already has a real kind can be a
       cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1529
Alexander Belopolsky40018472011-02-26 01:02:56 +00001530static int
Victor Stinner488fa492011-12-12 00:01:39 +01001531unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001532{
Victor Stinner488fa492011-12-12 00:01:39 +01001533 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 if (Py_REFCNT(unicode) != 1)
1535 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001536 if (_PyUnicode_HASH(unicode) != -1)
1537 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001538 if (PyUnicode_CHECK_INTERNED(unicode))
1539 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001540 if (!PyUnicode_CheckExact(unicode))
1541 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001542#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001543 /* singleton refcount is greater than 1 */
1544 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001545#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001546 return 1;
1547}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549static int
1550unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1551{
1552 PyObject *unicode;
1553 Py_ssize_t old_length;
1554
1555 assert(p_unicode != NULL);
1556 unicode = *p_unicode;
1557
1558 assert(unicode != NULL);
1559 assert(PyUnicode_Check(unicode));
1560 assert(0 <= length);
1561
Victor Stinner910337b2011-10-03 03:20:16 +02001562 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001563 old_length = PyUnicode_WSTR_LENGTH(unicode);
1564 else
1565 old_length = PyUnicode_GET_LENGTH(unicode);
1566 if (old_length == length)
1567 return 0;
1568
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001569 if (length == 0) {
1570 Py_DECREF(*p_unicode);
1571 *p_unicode = unicode_empty;
1572 Py_INCREF(*p_unicode);
1573 return 0;
1574 }
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 PyObject *copy = resize_copy(unicode, length);
1578 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 Py_DECREF(*p_unicode);
1581 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583 }
1584
Victor Stinnerfe226c02011-10-03 03:52:20 +02001585 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001586 PyObject *new_unicode = resize_compact(unicode, length);
1587 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001589 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001590 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001592 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001593 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594}
1595
Alexander Belopolsky40018472011-02-26 01:02:56 +00001596int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001597PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001598{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 PyObject *unicode;
1600 if (p_unicode == NULL) {
1601 PyErr_BadInternalCall();
1602 return -1;
1603 }
1604 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001605 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 {
1607 PyErr_BadInternalCall();
1608 return -1;
1609 }
1610 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001611}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001612
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001613static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001614unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001615{
1616 PyObject *result;
1617 assert(PyUnicode_IS_READY(*p_unicode));
1618 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1619 return 0;
1620 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1621 maxchar);
1622 if (result == NULL)
1623 return -1;
1624 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1625 PyUnicode_GET_LENGTH(*p_unicode));
1626 Py_DECREF(*p_unicode);
1627 *p_unicode = result;
1628 return 0;
1629}
1630
1631static int
1632unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1633 Py_UCS4 ch)
1634{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001635 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001636 if (unicode_widen(p_unicode, ch) < 0)
1637 return -1;
1638 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1639 PyUnicode_DATA(*p_unicode),
1640 (*pos)++, ch);
1641 return 0;
1642}
1643
Victor Stinnerc5166102012-02-22 13:55:02 +01001644/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1645 Return the length of the input string.
1646
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001647 WARNING: The function doesn't copy the terminating null character and
1648 doesn't check the maximum character (may write a latin1 character in an
1649 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001650static Py_ssize_t
1651unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1652{
1653 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1654 void *data = PyUnicode_DATA(unicode);
1655
1656 switch (kind) {
1657 case PyUnicode_1BYTE_KIND: {
1658 Py_ssize_t len = strlen(str);
1659 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001660 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001661 return len;
1662 }
1663 case PyUnicode_2BYTE_KIND: {
1664 Py_UCS2 *start = (Py_UCS2 *)data + index;
1665 Py_UCS2 *ucs2 = start;
1666 assert(index <= PyUnicode_GET_LENGTH(unicode));
1667
1668 for (; *str; ++ucs2, ++str)
1669 *ucs2 = (Py_UCS2)*str;
1670
1671 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1672 return ucs2 - start;
1673 }
1674 default: {
1675 Py_UCS4 *start = (Py_UCS4 *)data + index;
1676 Py_UCS4 *ucs4 = start;
1677 assert(kind == PyUnicode_4BYTE_KIND);
1678 assert(index <= PyUnicode_GET_LENGTH(unicode));
1679
1680 for (; *str; ++ucs4, ++str)
1681 *ucs4 = (Py_UCS4)*str;
1682
1683 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1684 return ucs4 - start;
1685 }
1686 }
1687}
1688
1689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690static PyObject*
1691get_latin1_char(unsigned char ch)
1692{
Victor Stinnera464fc12011-10-02 20:39:30 +02001693 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001695 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 if (!unicode)
1697 return NULL;
1698 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001699 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 unicode_latin1[ch] = unicode;
1701 }
1702 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704}
1705
Alexander Belopolsky40018472011-02-26 01:02:56 +00001706PyObject *
1707PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001709 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 Py_UCS4 maxchar = 0;
1711 Py_ssize_t num_surrogates;
1712
1713 if (u == NULL)
1714 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001716 /* If the Unicode data is known at construction time, we can apply
1717 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 /* Optimization for empty strings */
1720 if (size == 0 && unicode_empty != NULL) {
1721 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001722 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001723 }
Tim Petersced69f82003-09-16 20:30:58 +00001724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 /* Single character Unicode objects in the Latin-1 range are
1726 shared when using this constructor */
1727 if (size == 1 && *u < 256)
1728 return get_latin1_char((unsigned char)*u);
1729
1730 /* If not empty and not single character, copy the Unicode data
1731 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 if (find_maxchar_surrogates(u, u + size,
1733 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 return NULL;
1735
Victor Stinner8faf8212011-12-08 22:14:11 +01001736 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 if (!unicode)
1738 return NULL;
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 switch (PyUnicode_KIND(unicode)) {
1741 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001742 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1744 break;
1745 case PyUnicode_2BYTE_KIND:
1746#if Py_UNICODE_SIZE == 2
1747 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1748#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001749 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1751#endif
1752 break;
1753 case PyUnicode_4BYTE_KIND:
1754#if SIZEOF_WCHAR_T == 2
1755 /* This is the only case which has to process surrogates, thus
1756 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001757 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758#else
1759 assert(num_surrogates == 0);
1760 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1761#endif
1762 break;
1763 default:
1764 assert(0 && "Impossible state");
1765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001767 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768}
1769
Alexander Belopolsky40018472011-02-26 01:02:56 +00001770PyObject *
1771PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 if (size < 0) {
1774 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001776 return NULL;
1777 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001778 if (u != NULL)
1779 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1780 else
1781 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782}
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784PyObject *
1785PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001786{
1787 size_t size = strlen(u);
1788 if (size > PY_SSIZE_T_MAX) {
1789 PyErr_SetString(PyExc_OverflowError, "input too long");
1790 return NULL;
1791 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001792 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001793}
1794
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001795PyObject *
1796_PyUnicode_FromId(_Py_Identifier *id)
1797{
1798 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001799 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1800 strlen(id->string),
1801 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001802 if (!id->object)
1803 return NULL;
1804 PyUnicode_InternInPlace(&id->object);
1805 assert(!id->next);
1806 id->next = static_strings;
1807 static_strings = id;
1808 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001809 return id->object;
1810}
1811
1812void
1813_PyUnicode_ClearStaticStrings()
1814{
1815 _Py_Identifier *i;
1816 for (i = static_strings; i; i = i->next) {
1817 Py_DECREF(i->object);
1818 i->object = NULL;
1819 i->next = NULL;
1820 }
1821}
1822
Benjamin Peterson0df54292012-03-26 14:50:32 -04001823/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824
Victor Stinnere57b1c02011-09-28 22:20:48 +02001825static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001826unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001827{
Victor Stinner785938e2011-12-11 20:09:03 +01001828 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001829 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001830#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001831 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001832#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001833 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001834 }
Victor Stinner785938e2011-12-11 20:09:03 +01001835 unicode = PyUnicode_New(size, 127);
1836 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001837 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001838 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1839 assert(_PyUnicode_CheckConsistency(unicode, 1));
1840 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001841}
1842
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001843static Py_UCS4
1844kind_maxchar_limit(unsigned int kind)
1845{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001846 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001847 case PyUnicode_1BYTE_KIND:
1848 return 0x80;
1849 case PyUnicode_2BYTE_KIND:
1850 return 0x100;
1851 case PyUnicode_4BYTE_KIND:
1852 return 0x10000;
1853 default:
1854 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001855 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001856 }
1857}
1858
Victor Stinner702c7342011-10-05 13:50:52 +02001859static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001860_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001863 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001864
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001865 if (size == 0) {
1866 Py_INCREF(unicode_empty);
1867 return unicode_empty;
1868 }
1869 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001870 if (size == 1)
1871 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001873 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001874 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 if (!res)
1876 return NULL;
1877 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001878 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001880}
1881
Victor Stinnere57b1c02011-09-28 22:20:48 +02001882static PyObject*
1883_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884{
1885 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001886 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001887
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 if (size == 0) {
1889 Py_INCREF(unicode_empty);
1890 return unicode_empty;
1891 }
1892 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001894 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001895
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001896 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001897 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 if (!res)
1899 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001900 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001902 else {
1903 _PyUnicode_CONVERT_BYTES(
1904 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1905 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001906 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 return res;
1908}
1909
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910static PyObject*
1911_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912{
1913 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001915
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 if (size == 0) {
1917 Py_INCREF(unicode_empty);
1918 return unicode_empty;
1919 }
1920 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001921 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001922 return get_latin1_char((unsigned char)u[0]);
1923
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001924 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001925 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!res)
1927 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001928 if (max_char < 256)
1929 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1930 PyUnicode_1BYTE_DATA(res));
1931 else if (max_char < 0x10000)
1932 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1933 PyUnicode_2BYTE_DATA(res));
1934 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001936 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 return res;
1938}
1939
1940PyObject*
1941PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1942{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001943 if (size < 0) {
1944 PyErr_SetString(PyExc_ValueError, "size must be positive");
1945 return NULL;
1946 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001947 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001949 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001951 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001955 PyErr_SetString(PyExc_SystemError, "invalid kind");
1956 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958}
1959
Victor Stinner25a4b292011-10-06 12:31:55 +02001960/* Ensure that a string uses the most efficient storage, if it is not the
1961 case: create a new string with of the right kind. Write NULL into *p_unicode
1962 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001963static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001964unicode_adjust_maxchar(PyObject **p_unicode)
1965{
1966 PyObject *unicode, *copy;
1967 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001968 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001969 unsigned int kind;
1970
1971 assert(p_unicode != NULL);
1972 unicode = *p_unicode;
1973 assert(PyUnicode_IS_READY(unicode));
1974 if (PyUnicode_IS_ASCII(unicode))
1975 return;
1976
1977 len = PyUnicode_GET_LENGTH(unicode);
1978 kind = PyUnicode_KIND(unicode);
1979 if (kind == PyUnicode_1BYTE_KIND) {
1980 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001981 max_char = ucs1lib_find_max_char(u, u + len);
1982 if (max_char >= 128)
1983 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001984 }
1985 else if (kind == PyUnicode_2BYTE_KIND) {
1986 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001987 max_char = ucs2lib_find_max_char(u, u + len);
1988 if (max_char >= 256)
1989 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001990 }
1991 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001992 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001993 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001994 max_char = ucs4lib_find_max_char(u, u + len);
1995 if (max_char >= 0x10000)
1996 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001997 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001998 copy = PyUnicode_New(len, max_char);
1999 copy_characters(copy, 0, unicode, 0, len);
2000 Py_DECREF(unicode);
2001 *p_unicode = copy;
2002}
2003
Victor Stinner034f6cf2011-09-30 02:26:44 +02002004PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002005_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002006{
Victor Stinner87af4f22011-11-21 23:03:47 +01002007 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002008 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002009
Victor Stinner034f6cf2011-09-30 02:26:44 +02002010 if (!PyUnicode_Check(unicode)) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002014 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002015 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002016
Victor Stinner87af4f22011-11-21 23:03:47 +01002017 length = PyUnicode_GET_LENGTH(unicode);
2018 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002019 if (!copy)
2020 return NULL;
2021 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2022
Victor Stinner87af4f22011-11-21 23:03:47 +01002023 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2024 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002025 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002026 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002027}
2028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029
Victor Stinnerbc603d12011-10-02 01:00:40 +02002030/* Widen Unicode objects to larger buffers. Don't write terminating null
2031 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032
2033void*
2034_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2035{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002036 Py_ssize_t len;
2037 void *result;
2038 unsigned int skind;
2039
Benjamin Petersonbac79492012-01-14 13:34:47 -05002040 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002041 return NULL;
2042
2043 len = PyUnicode_GET_LENGTH(s);
2044 skind = PyUnicode_KIND(s);
2045 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002046 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 return NULL;
2048 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002049 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002050 case PyUnicode_2BYTE_KIND:
2051 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2052 if (!result)
2053 return PyErr_NoMemory();
2054 assert(skind == PyUnicode_1BYTE_KIND);
2055 _PyUnicode_CONVERT_BYTES(
2056 Py_UCS1, Py_UCS2,
2057 PyUnicode_1BYTE_DATA(s),
2058 PyUnicode_1BYTE_DATA(s) + len,
2059 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002061 case PyUnicode_4BYTE_KIND:
2062 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2063 if (!result)
2064 return PyErr_NoMemory();
2065 if (skind == PyUnicode_2BYTE_KIND) {
2066 _PyUnicode_CONVERT_BYTES(
2067 Py_UCS2, Py_UCS4,
2068 PyUnicode_2BYTE_DATA(s),
2069 PyUnicode_2BYTE_DATA(s) + len,
2070 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002072 else {
2073 assert(skind == PyUnicode_1BYTE_KIND);
2074 _PyUnicode_CONVERT_BYTES(
2075 Py_UCS1, Py_UCS4,
2076 PyUnicode_1BYTE_DATA(s),
2077 PyUnicode_1BYTE_DATA(s) + len,
2078 result);
2079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002081 default:
2082 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 }
Victor Stinner01698042011-10-04 00:04:26 +02002084 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 return NULL;
2086}
2087
2088static Py_UCS4*
2089as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
2092 int kind;
2093 void *data;
2094 Py_ssize_t len, targetlen;
2095 if (PyUnicode_READY(string) == -1)
2096 return NULL;
2097 kind = PyUnicode_KIND(string);
2098 data = PyUnicode_DATA(string);
2099 len = PyUnicode_GET_LENGTH(string);
2100 targetlen = len;
2101 if (copy_null)
2102 targetlen++;
2103 if (!target) {
2104 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2105 PyErr_NoMemory();
2106 return NULL;
2107 }
2108 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2109 if (!target) {
2110 PyErr_NoMemory();
2111 return NULL;
2112 }
2113 }
2114 else {
2115 if (targetsize < targetlen) {
2116 PyErr_Format(PyExc_SystemError,
2117 "string is longer than the buffer");
2118 if (copy_null && 0 < targetsize)
2119 target[0] = 0;
2120 return NULL;
2121 }
2122 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002123 if (kind == PyUnicode_1BYTE_KIND) {
2124 Py_UCS1 *start = (Py_UCS1 *) data;
2125 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002127 else if (kind == PyUnicode_2BYTE_KIND) {
2128 Py_UCS2 *start = (Py_UCS2 *) data;
2129 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2130 }
2131 else {
2132 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 if (copy_null)
2136 target[len] = 0;
2137 return target;
2138}
2139
2140Py_UCS4*
2141PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2142 int copy_null)
2143{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002144 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 PyErr_BadInternalCall();
2146 return NULL;
2147 }
2148 return as_ucs4(string, target, targetsize, copy_null);
2149}
2150
2151Py_UCS4*
2152PyUnicode_AsUCS4Copy(PyObject *string)
2153{
2154 return as_ucs4(string, NULL, 0, 1);
2155}
2156
2157#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002158
Alexander Belopolsky40018472011-02-26 01:02:56 +00002159PyObject *
2160PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002163 if (size == 0) {
2164 Py_INCREF(unicode_empty);
2165 return unicode_empty;
2166 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002167 PyErr_BadInternalCall();
2168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 }
2170
Martin v. Löwis790465f2008-04-05 20:41:37 +00002171 if (size == -1) {
2172 size = wcslen(w);
2173 }
2174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176}
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002179
Walter Dörwald346737f2007-05-31 10:44:43 +00002180static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002181makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2182 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002183{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002184 *fmt++ = '%';
2185 if (width) {
2186 if (zeropad)
2187 *fmt++ = '0';
2188 fmt += sprintf(fmt, "%d", width);
2189 }
2190 if (precision)
2191 fmt += sprintf(fmt, ".%d", precision);
2192 if (longflag)
2193 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002194 else if (longlongflag) {
2195 /* longlongflag should only ever be nonzero on machines with
2196 HAVE_LONG_LONG defined */
2197#ifdef HAVE_LONG_LONG
2198 char *f = PY_FORMAT_LONG_LONG;
2199 while (*f)
2200 *fmt++ = *f++;
2201#else
2202 /* we shouldn't ever get here */
2203 assert(0);
2204 *fmt++ = 'l';
2205#endif
2206 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 else if (size_tflag) {
2208 char *f = PY_FORMAT_SIZE_T;
2209 while (*f)
2210 *fmt++ = *f++;
2211 }
2212 *fmt++ = c;
2213 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002214}
2215
Victor Stinner96865452011-03-01 23:44:09 +00002216/* helper for PyUnicode_FromFormatV() */
2217
2218static const char*
2219parse_format_flags(const char *f,
2220 int *p_width, int *p_precision,
2221 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2222{
2223 int width, precision, longflag, longlongflag, size_tflag;
2224
2225 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2226 f++;
2227 width = 0;
2228 while (Py_ISDIGIT((unsigned)*f))
2229 width = (width*10) + *f++ - '0';
2230 precision = 0;
2231 if (*f == '.') {
2232 f++;
2233 while (Py_ISDIGIT((unsigned)*f))
2234 precision = (precision*10) + *f++ - '0';
2235 if (*f == '%') {
2236 /* "%.3%s" => f points to "3" */
2237 f--;
2238 }
2239 }
2240 if (*f == '\0') {
2241 /* bogus format "%.1" => go backward, f points to "1" */
2242 f--;
2243 }
2244 if (p_width != NULL)
2245 *p_width = width;
2246 if (p_precision != NULL)
2247 *p_precision = precision;
2248
2249 /* Handle %ld, %lu, %lld and %llu. */
2250 longflag = 0;
2251 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002252 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002253
2254 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002255 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002256 longflag = 1;
2257 ++f;
2258 }
2259#ifdef HAVE_LONG_LONG
2260 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002261 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002262 longlongflag = 1;
2263 f += 2;
2264 }
2265#endif
2266 }
2267 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002268 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002269 size_tflag = 1;
2270 ++f;
2271 }
2272 if (p_longflag != NULL)
2273 *p_longflag = longflag;
2274 if (p_longlongflag != NULL)
2275 *p_longlongflag = longlongflag;
2276 if (p_size_tflag != NULL)
2277 *p_size_tflag = size_tflag;
2278 return f;
2279}
2280
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum space reserved for one
   formatted number that is not a long long. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In the expression below, 1 + (SIZEOF_LONG_LONG*53-1)/22 is the integer
   form of ceil(SIZEOF_LONG_LONG*53/22); the remaining 1 is the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2288
Walter Dörwaldd2034312007-05-18 16:29:38 +00002289PyObject *
2290PyUnicode_FromFormatV(const char *format, va_list vargs)
2291{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 va_list count;
2293 Py_ssize_t callcount = 0;
2294 PyObject **callresults = NULL;
2295 PyObject **callresult = NULL;
2296 Py_ssize_t n = 0;
2297 int width = 0;
2298 int precision = 0;
2299 int zeropad;
2300 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002301 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002302 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002303 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2305 Py_UCS4 argmaxchar;
2306 Py_ssize_t numbersize = 0;
2307 char *numberresults = NULL;
2308 char *numberresult = NULL;
2309 Py_ssize_t i;
2310 int kind;
2311 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002312
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002313 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002314 /* step 1: count the number of %S/%R/%A/%s format specifications
2315 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2316 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002318 * also estimate a upper bound for all the number formats in the string,
2319 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 for (f = format; *f; f++) {
2322 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002323 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002324 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2325 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2326 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2327 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002330#ifdef HAVE_LONG_LONG
2331 if (longlongflag) {
2332 if (width < MAX_LONG_LONG_CHARS)
2333 width = MAX_LONG_LONG_CHARS;
2334 }
2335 else
2336#endif
2337 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2338 including sign. Decimal takes the most space. This
2339 isn't enough for octal. If a width is specified we
2340 need more (which we allocate later). */
2341 if (width < MAX_LONG_CHARS)
2342 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343
2344 /* account for the size + '\0' to separate numbers
2345 inside of the numberresults buffer */
2346 numbersize += (width + 1);
2347 }
2348 }
2349 else if ((unsigned char)*f > 127) {
2350 PyErr_Format(PyExc_ValueError,
2351 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2352 "string, got a non-ASCII byte: 0x%02x",
2353 (unsigned char)*f);
2354 return NULL;
2355 }
2356 }
2357 /* step 2: allocate memory for the results of
2358 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2359 if (callcount) {
2360 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2361 if (!callresults) {
2362 PyErr_NoMemory();
2363 return NULL;
2364 }
2365 callresult = callresults;
2366 }
2367 /* step 2.5: allocate memory for the results of formating numbers */
2368 if (numbersize) {
2369 numberresults = PyObject_Malloc(numbersize);
2370 if (!numberresults) {
2371 PyErr_NoMemory();
2372 goto fail;
2373 }
2374 numberresult = numberresults;
2375 }
2376
2377 /* step 3: format numbers and figure out how large a buffer we need */
2378 for (f = format; *f; f++) {
2379 if (*f == '%') {
2380 const char* p;
2381 int longflag;
2382 int longlongflag;
2383 int size_tflag;
2384 int numprinted;
2385
2386 p = f;
2387 zeropad = (f[1] == '0');
2388 f = parse_format_flags(f, &width, &precision,
2389 &longflag, &longlongflag, &size_tflag);
2390 switch (*f) {
2391 case 'c':
2392 {
2393 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002394 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 n++;
2396 break;
2397 }
2398 case '%':
2399 n++;
2400 break;
2401 case 'i':
2402 case 'd':
2403 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2404 width, precision, *f);
2405 if (longflag)
2406 numprinted = sprintf(numberresult, fmt,
2407 va_arg(count, long));
2408#ifdef HAVE_LONG_LONG
2409 else if (longlongflag)
2410 numprinted = sprintf(numberresult, fmt,
2411 va_arg(count, PY_LONG_LONG));
2412#endif
2413 else if (size_tflag)
2414 numprinted = sprintf(numberresult, fmt,
2415 va_arg(count, Py_ssize_t));
2416 else
2417 numprinted = sprintf(numberresult, fmt,
2418 va_arg(count, int));
2419 n += numprinted;
2420 /* advance by +1 to skip over the '\0' */
2421 numberresult += (numprinted + 1);
2422 assert(*(numberresult - 1) == '\0');
2423 assert(*(numberresult - 2) != '\0');
2424 assert(numprinted >= 0);
2425 assert(numberresult <= numberresults + numbersize);
2426 break;
2427 case 'u':
2428 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2429 width, precision, 'u');
2430 if (longflag)
2431 numprinted = sprintf(numberresult, fmt,
2432 va_arg(count, unsigned long));
2433#ifdef HAVE_LONG_LONG
2434 else if (longlongflag)
2435 numprinted = sprintf(numberresult, fmt,
2436 va_arg(count, unsigned PY_LONG_LONG));
2437#endif
2438 else if (size_tflag)
2439 numprinted = sprintf(numberresult, fmt,
2440 va_arg(count, size_t));
2441 else
2442 numprinted = sprintf(numberresult, fmt,
2443 va_arg(count, unsigned int));
2444 n += numprinted;
2445 numberresult += (numprinted + 1);
2446 assert(*(numberresult - 1) == '\0');
2447 assert(*(numberresult - 2) != '\0');
2448 assert(numprinted >= 0);
2449 assert(numberresult <= numberresults + numbersize);
2450 break;
2451 case 'x':
2452 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2453 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2454 n += numprinted;
2455 numberresult += (numprinted + 1);
2456 assert(*(numberresult - 1) == '\0');
2457 assert(*(numberresult - 2) != '\0');
2458 assert(numprinted >= 0);
2459 assert(numberresult <= numberresults + numbersize);
2460 break;
2461 case 'p':
2462 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2463 /* %p is ill-defined: ensure leading 0x. */
2464 if (numberresult[1] == 'X')
2465 numberresult[1] = 'x';
2466 else if (numberresult[1] != 'x') {
2467 memmove(numberresult + 2, numberresult,
2468 strlen(numberresult) + 1);
2469 numberresult[0] = '0';
2470 numberresult[1] = 'x';
2471 numprinted += 2;
2472 }
2473 n += numprinted;
2474 numberresult += (numprinted + 1);
2475 assert(*(numberresult - 1) == '\0');
2476 assert(*(numberresult - 2) != '\0');
2477 assert(numprinted >= 0);
2478 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 break;
2480 case 's':
2481 {
2482 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002483 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002484 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002485 if (!str)
2486 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 /* since PyUnicode_DecodeUTF8 returns already flexible
2488 unicode objects, there is no need to call ready on them */
2489 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002490 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002492 /* Remember the str and switch to the next slot */
2493 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 break;
2495 }
2496 case 'U':
2497 {
2498 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002499 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 if (PyUnicode_READY(obj) == -1)
2501 goto fail;
2502 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002503 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 break;
2506 }
2507 case 'V':
2508 {
2509 PyObject *obj = va_arg(count, PyObject *);
2510 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002511 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002513 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002514 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 if (PyUnicode_READY(obj) == -1)
2516 goto fail;
2517 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002518 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002520 *callresult++ = NULL;
2521 }
2522 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002523 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002524 if (!str_obj)
2525 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002526 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002527 Py_DECREF(str_obj);
2528 goto fail;
2529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002531 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002533 *callresult++ = str_obj;
2534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 break;
2536 }
2537 case 'S':
2538 {
2539 PyObject *obj = va_arg(count, PyObject *);
2540 PyObject *str;
2541 assert(obj);
2542 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002543 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002545 if (PyUnicode_READY(str) == -1) {
2546 Py_DECREF(str);
2547 goto fail;
2548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002550 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002552 /* Remember the str and switch to the next slot */
2553 *callresult++ = str;
2554 break;
2555 }
2556 case 'R':
2557 {
2558 PyObject *obj = va_arg(count, PyObject *);
2559 PyObject *repr;
2560 assert(obj);
2561 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002562 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002564 if (PyUnicode_READY(repr) == -1) {
2565 Py_DECREF(repr);
2566 goto fail;
2567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002569 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002571 /* Remember the repr and switch to the next slot */
2572 *callresult++ = repr;
2573 break;
2574 }
2575 case 'A':
2576 {
2577 PyObject *obj = va_arg(count, PyObject *);
2578 PyObject *ascii;
2579 assert(obj);
2580 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002581 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002583 if (PyUnicode_READY(ascii) == -1) {
2584 Py_DECREF(ascii);
2585 goto fail;
2586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002588 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 /* Remember the repr and switch to the next slot */
2591 *callresult++ = ascii;
2592 break;
2593 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 default:
2595 /* if we stumble upon an unknown
2596 formatting code, copy the rest of
2597 the format string to the output
2598 string. (we cannot just skip the
2599 code, since there's no way to know
2600 what's in the argument list) */
2601 n += strlen(p);
2602 goto expand;
2603 }
2604 } else
2605 n++;
2606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 we don't have to resize the string.
2611 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002612 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 if (!string)
2614 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 kind = PyUnicode_KIND(string);
2616 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002622 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002623
2624 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2626 /* checking for == because the last argument could be a empty
2627 string, which causes i to point to end, the assert at the end of
2628 the loop */
2629 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002630
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 switch (*f) {
2632 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002633 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 const int ordinal = va_arg(vargs, int);
2635 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002637 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002638 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002643 {
2644 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 /* unused, since we already have the result */
2646 if (*f == 'p')
2647 (void) va_arg(vargs, void *);
2648 else
2649 (void) va_arg(vargs, int);
2650 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002651 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002653 i += written;
2654 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 assert(*numberresult == '\0');
2656 numberresult++;
2657 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 case 's':
2661 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002662 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002664 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 size = PyUnicode_GET_LENGTH(*callresult);
2666 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002667 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002669 /* We're done with the unicode()/repr() => forget it */
2670 Py_DECREF(*callresult);
2671 /* switch to next unicode()/repr() result */
2672 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 break;
2674 }
2675 case 'U':
2676 {
2677 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 Py_ssize_t size;
2679 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2680 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002681 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 break;
2684 }
2685 case 'V':
2686 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002689 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 size = PyUnicode_GET_LENGTH(obj);
2692 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002693 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002695 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002696 size = PyUnicode_GET_LENGTH(*callresult);
2697 assert(PyUnicode_KIND(*callresult) <=
2698 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002699 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002701 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002703 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 break;
2705 }
2706 case 'S':
2707 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002708 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002710 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 /* unused, since we already have the result */
2712 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002714 copy_characters(string, i, *callresult, 0, size);
2715 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 /* We're done with the unicode()/repr() => forget it */
2717 Py_DECREF(*callresult);
2718 /* switch to next unicode()/repr() result */
2719 ++callresult;
2720 break;
2721 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 break;
2725 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002726 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 goto end;
2729 }
Victor Stinner1205f272010-09-11 00:54:47 +00002730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 else {
2732 assert(i < PyUnicode_GET_LENGTH(string));
2733 PyUnicode_WRITE(kind, data, i++, *f);
2734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002737
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 if (callresults)
2740 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 if (numberresults)
2742 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002743 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002744 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 if (callresults) {
2746 PyObject **callresult2 = callresults;
2747 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002748 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 ++callresult2;
2750 }
2751 PyObject_Free(callresults);
2752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 if (numberresults)
2754 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002755 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002756}
2757
Walter Dörwaldd2034312007-05-18 16:29:38 +00002758PyObject *
2759PyUnicode_FromFormat(const char *format, ...)
2760{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 PyObject* ret;
2762 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002763
2764#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002765 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002766#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002768#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 ret = PyUnicode_FromFormatV(format, vargs);
2770 va_end(vargs);
2771 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002772}
2773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774#ifdef HAVE_WCHAR_H
2775
Victor Stinner5593d8a2010-10-02 11:11:27 +00002776/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2777 convert a Unicode object to a wide character string.
2778
Victor Stinnerd88d9832011-09-06 02:00:05 +02002779 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 character) required to convert the unicode object. Ignore size argument.
2781
Victor Stinnerd88d9832011-09-06 02:00:05 +02002782 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002783 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002784 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002786unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002787 wchar_t *w,
2788 Py_ssize_t size)
2789{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002790 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 const wchar_t *wstr;
2792
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002793 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 if (wstr == NULL)
2795 return -1;
2796
Victor Stinner5593d8a2010-10-02 11:11:27 +00002797 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002798 if (size > res)
2799 size = res + 1;
2800 else
2801 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002803 return res;
2804 }
2805 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002807}
2808
2809Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002810PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002811 wchar_t *w,
2812 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813{
2814 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002815 PyErr_BadInternalCall();
2816 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002818 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819}
2820
Victor Stinner137c34c2010-09-29 10:25:54 +00002821wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002822PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002823 Py_ssize_t *size)
2824{
2825 wchar_t* buffer;
2826 Py_ssize_t buflen;
2827
2828 if (unicode == NULL) {
2829 PyErr_BadInternalCall();
2830 return NULL;
2831 }
2832
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002833 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 if (buflen == -1)
2835 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002836 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002837 PyErr_NoMemory();
2838 return NULL;
2839 }
2840
Victor Stinner137c34c2010-09-29 10:25:54 +00002841 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2842 if (buffer == NULL) {
2843 PyErr_NoMemory();
2844 return NULL;
2845 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002846 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002847 if (buflen == -1)
2848 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002849 if (size != NULL)
2850 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002851 return buffer;
2852}
2853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855
Alexander Belopolsky40018472011-02-26 01:02:56 +00002856PyObject *
2857PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002859 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002860 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002861 PyErr_SetString(PyExc_ValueError,
2862 "chr() arg not in range(0x110000)");
2863 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002864 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002866 if (ordinal < 256)
2867 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 v = PyUnicode_New(1, ordinal);
2870 if (v == NULL)
2871 return NULL;
2872 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002873 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002874 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002875}
2876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002880 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002882 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002883 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002884 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 Py_INCREF(obj);
2886 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002887 }
2888 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 /* For a Unicode subtype that's not a Unicode object,
2890 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002891 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002893 PyErr_Format(PyExc_TypeError,
2894 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002895 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002896 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002897}
2898
Alexander Belopolsky40018472011-02-26 01:02:56 +00002899PyObject *
2900PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002901 const char *encoding,
2902 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002903{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002904 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002905 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002906
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 PyErr_BadInternalCall();
2909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002911
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002912 /* Decoding bytes objects is the most common case and should be fast */
2913 if (PyBytes_Check(obj)) {
2914 if (PyBytes_GET_SIZE(obj) == 0) {
2915 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002916 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002917 }
2918 else {
2919 v = PyUnicode_Decode(
2920 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2921 encoding, errors);
2922 }
2923 return v;
2924 }
2925
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002926 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002927 PyErr_SetString(PyExc_TypeError,
2928 "decoding str is not supported");
2929 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002931
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002932 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2933 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2934 PyErr_Format(PyExc_TypeError,
2935 "coercing to str: need bytes, bytearray "
2936 "or buffer-like object, %.80s found",
2937 Py_TYPE(obj)->tp_name);
2938 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002939 }
Tim Petersced69f82003-09-16 20:30:58 +00002940
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002941 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002943 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 }
Tim Petersced69f82003-09-16 20:30:58 +00002945 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002946 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002947
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002948 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002949 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950}
2951
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` is the output buffer and `lower_len` its size in bytes,
   including room for the terminating NUL.

   Return 0 on error (the result, terminator included, does not fit in
   lower_len bytes), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without at least one byte there is no room for the terminator, and
       &lower[lower_len - 1] below would point before the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof("utf-8") == strlen("utf-8") + 1 */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2988
Alexander Belopolsky40018472011-02-26 01:02:56 +00002989PyObject *
2990PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002991 Py_ssize_t size,
2992 const char *encoding,
2993 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002994{
2995 PyObject *buffer = NULL, *unicode;
2996 Py_buffer info;
2997 char lower[11]; /* Enough for any encoding shortcut */
2998
Fred Drakee4315f52000-05-09 19:53:39 +00002999 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003000 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003001 if ((strcmp(lower, "utf-8") == 0) ||
3002 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003003 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003004 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003005 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003006 (strcmp(lower, "iso-8859-1") == 0))
3007 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003008#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003009 else if (strcmp(lower, "mbcs") == 0)
3010 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003011#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003012 else if (strcmp(lower, "ascii") == 0)
3013 return PyUnicode_DecodeASCII(s, size, errors);
3014 else if (strcmp(lower, "utf-16") == 0)
3015 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3016 else if (strcmp(lower, "utf-32") == 0)
3017 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019
3020 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003021 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003022 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003023 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003024 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 if (buffer == NULL)
3026 goto onError;
3027 unicode = PyCodec_Decode(buffer, encoding, errors);
3028 if (unicode == NULL)
3029 goto onError;
3030 if (!PyUnicode_Check(unicode)) {
3031 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003032 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003033 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 Py_DECREF(unicode);
3035 goto onError;
3036 }
3037 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003038 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003039
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 Py_XDECREF(buffer);
3042 return NULL;
3043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 const char *encoding,
3048 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003049{
3050 PyObject *v;
3051
3052 if (!PyUnicode_Check(unicode)) {
3053 PyErr_BadArgument();
3054 goto onError;
3055 }
3056
3057 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003059
3060 /* Decode via the codec registry */
3061 v = PyCodec_Decode(unicode, encoding, errors);
3062 if (v == NULL)
3063 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003064 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067 return NULL;
3068}
3069
Alexander Belopolsky40018472011-02-26 01:02:56 +00003070PyObject *
3071PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003072 const char *encoding,
3073 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003074{
3075 PyObject *v;
3076
3077 if (!PyUnicode_Check(unicode)) {
3078 PyErr_BadArgument();
3079 goto onError;
3080 }
3081
3082 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003084
3085 /* Decode via the codec registry */
3086 v = PyCodec_Decode(unicode, encoding, errors);
3087 if (v == NULL)
3088 goto onError;
3089 if (!PyUnicode_Check(v)) {
3090 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003091 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003092 Py_TYPE(v)->tp_name);
3093 Py_DECREF(v);
3094 goto onError;
3095 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003096 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003097
Benjamin Peterson29060642009-01-31 22:14:21 +00003098 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003099 return NULL;
3100}
3101
Alexander Belopolsky40018472011-02-26 01:02:56 +00003102PyObject *
3103PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003104 Py_ssize_t size,
3105 const char *encoding,
3106 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107{
3108 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003109
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 unicode = PyUnicode_FromUnicode(s, size);
3111 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3114 Py_DECREF(unicode);
3115 return v;
3116}
3117
Alexander Belopolsky40018472011-02-26 01:02:56 +00003118PyObject *
3119PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003120 const char *encoding,
3121 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003122{
3123 PyObject *v;
3124
3125 if (!PyUnicode_Check(unicode)) {
3126 PyErr_BadArgument();
3127 goto onError;
3128 }
3129
3130 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003132
3133 /* Encode via the codec registry */
3134 v = PyCodec_Encode(unicode, encoding, errors);
3135 if (v == NULL)
3136 goto onError;
3137 return v;
3138
Benjamin Peterson29060642009-01-31 22:14:21 +00003139 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003140 return NULL;
3141}
3142
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 2 bytes wide).  Returns the index, in wchar_t
   units, of the first unit that fails, or 0 when every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) plus the terminating NUL. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor++;
            chunk[1] = 0;
        }
#else
        chunk[0] = *cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3189
Victor Stinner1b579672011-12-17 05:47:23 +01003190static int
3191locale_error_handler(const char *errors, int *surrogateescape)
3192{
3193 if (errors == NULL) {
3194 *surrogateescape = 0;
3195 return 0;
3196 }
3197
3198 if (strcmp(errors, "strict") == 0) {
3199 *surrogateescape = 0;
3200 return 0;
3201 }
3202 if (strcmp(errors, "surrogateescape") == 0) {
3203 *surrogateescape = 1;
3204 return 0;
3205 }
3206 PyErr_Format(PyExc_ValueError,
3207 "only 'strict' and 'surrogateescape' error handlers "
3208 "are supported, not '%s'",
3209 errors);
3210 return -1;
3211}
3212
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003214PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003215{
3216 Py_ssize_t wlen, wlen2;
3217 wchar_t *wstr;
3218 PyObject *bytes = NULL;
3219 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003220 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003221 PyObject *exc;
3222 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003223 int surrogateescape;
3224
3225 if (locale_error_handler(errors, &surrogateescape) < 0)
3226 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003227
3228 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3229 if (wstr == NULL)
3230 return NULL;
3231
3232 wlen2 = wcslen(wstr);
3233 if (wlen2 != wlen) {
3234 PyMem_Free(wstr);
3235 PyErr_SetString(PyExc_TypeError, "embedded null character");
3236 return NULL;
3237 }
3238
3239 if (surrogateescape) {
3240 /* locale encoding with surrogateescape */
3241 char *str;
3242
3243 str = _Py_wchar2char(wstr, &error_pos);
3244 if (str == NULL) {
3245 if (error_pos == (size_t)-1) {
3246 PyErr_NoMemory();
3247 PyMem_Free(wstr);
3248 return NULL;
3249 }
3250 else {
3251 goto encode_error;
3252 }
3253 }
3254 PyMem_Free(wstr);
3255
3256 bytes = PyBytes_FromString(str);
3257 PyMem_Free(str);
3258 }
3259 else {
3260 size_t len, len2;
3261
3262 len = wcstombs(NULL, wstr, 0);
3263 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003264 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003265 goto encode_error;
3266 }
3267
3268 bytes = PyBytes_FromStringAndSize(NULL, len);
3269 if (bytes == NULL) {
3270 PyMem_Free(wstr);
3271 return NULL;
3272 }
3273
3274 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3275 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003276 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003277 goto encode_error;
3278 }
3279 PyMem_Free(wstr);
3280 }
3281 return bytes;
3282
3283encode_error:
3284 errmsg = strerror(errno);
3285 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003286
3287 if (error_pos == (size_t)-1)
3288 error_pos = wcstombs_errorpos(wstr);
3289
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003290 PyMem_Free(wstr);
3291 Py_XDECREF(bytes);
3292
Victor Stinner2f197072011-12-17 07:08:30 +01003293 if (errmsg != NULL) {
3294 size_t errlen;
3295 wstr = _Py_char2wchar(errmsg, &errlen);
3296 if (wstr != NULL) {
3297 reason = PyUnicode_FromWideChar(wstr, errlen);
3298 PyMem_Free(wstr);
3299 } else
3300 errmsg = NULL;
3301 }
3302 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003303 reason = PyUnicode_FromString(
3304 "wcstombs() encountered an unencodable "
3305 "wide character");
3306 if (reason == NULL)
3307 return NULL;
3308
3309 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3310 "locale", unicode,
3311 (Py_ssize_t)error_pos,
3312 (Py_ssize_t)(error_pos+1),
3313 reason);
3314 Py_DECREF(reason);
3315 if (exc != NULL) {
3316 PyCodec_StrictErrors(exc);
3317 Py_XDECREF(exc);
3318 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003319 return NULL;
3320}
3321
Victor Stinnerad158722010-10-27 00:25:46 +00003322PyObject *
3323PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003324{
Victor Stinner99b95382011-07-04 14:23:54 +02003325#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003326 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003327#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003329#else
Victor Stinner793b5312011-04-27 00:24:21 +02003330 PyInterpreterState *interp = PyThreadState_GET()->interp;
3331 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3332 cannot use it to encode and decode filenames before it is loaded. Load
3333 the Python codec requires to encode at least its own filename. Use the C
3334 version of the locale codec until the codec registry is initialized and
3335 the Python codec is loaded.
3336
3337 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3338 cannot only rely on it: check also interp->fscodec_initialized for
3339 subinterpreters. */
3340 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003341 return PyUnicode_AsEncodedString(unicode,
3342 Py_FileSystemDefaultEncoding,
3343 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003344 }
3345 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003346 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003347 }
Victor Stinnerad158722010-10-27 00:25:46 +00003348#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003349}
3350
Alexander Belopolsky40018472011-02-26 01:02:56 +00003351PyObject *
3352PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003353 const char *encoding,
3354 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355{
3356 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003357 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003358
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 if (!PyUnicode_Check(unicode)) {
3360 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 }
Fred Drakee4315f52000-05-09 19:53:39 +00003363
Fred Drakee4315f52000-05-09 19:53:39 +00003364 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003365 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003366 if ((strcmp(lower, "utf-8") == 0) ||
3367 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003368 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003369 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003370 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003371 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003372 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003373 }
Victor Stinner37296e82010-06-10 13:36:23 +00003374 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003375 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003376 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003377 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003378#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003379 else if (strcmp(lower, "mbcs") == 0)
3380 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003381#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003382 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003383 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385
3386 /* Encode via the codec registry */
3387 v = PyCodec_Encode(unicode, encoding, errors);
3388 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003389 return NULL;
3390
3391 /* The normal path */
3392 if (PyBytes_Check(v))
3393 return v;
3394
3395 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003396 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003397 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003398 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003399
3400 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3401 "encoder %s returned bytearray instead of bytes",
3402 encoding);
3403 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003404 Py_DECREF(v);
3405 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003406 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003407
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003408 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3409 Py_DECREF(v);
3410 return b;
3411 }
3412
3413 PyErr_Format(PyExc_TypeError,
3414 "encoder did not return a bytes object (type=%.400s)",
3415 Py_TYPE(v)->tp_name);
3416 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003417 return NULL;
3418}
3419
Alexander Belopolsky40018472011-02-26 01:02:56 +00003420PyObject *
3421PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003422 const char *encoding,
3423 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003424{
3425 PyObject *v;
3426
3427 if (!PyUnicode_Check(unicode)) {
3428 PyErr_BadArgument();
3429 goto onError;
3430 }
3431
3432 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003434
3435 /* Encode via the codec registry */
3436 v = PyCodec_Encode(unicode, encoding, errors);
3437 if (v == NULL)
3438 goto onError;
3439 if (!PyUnicode_Check(v)) {
3440 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003441 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003442 Py_TYPE(v)->tp_name);
3443 Py_DECREF(v);
3444 goto onError;
3445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003447
Benjamin Peterson29060642009-01-31 22:14:21 +00003448 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 return NULL;
3450}
3451
/* Locate the first byte sequence in `str` (`len` bytes) that mbrtowc()
   cannot convert.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence, or 0 when none is found.  Without HAVE_MBRTOWC the position
   cannot be determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3482
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003483PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003484PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003485 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003486{
3487 wchar_t smallbuf[256];
3488 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3489 wchar_t *wstr;
3490 size_t wlen, wlen2;
3491 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003492 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003493 size_t error_pos;
3494 char *errmsg;
3495 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003496
3497 if (locale_error_handler(errors, &surrogateescape) < 0)
3498 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003499
3500 if (str[len] != '\0' || len != strlen(str)) {
3501 PyErr_SetString(PyExc_TypeError, "embedded null character");
3502 return NULL;
3503 }
3504
3505 if (surrogateescape)
3506 {
3507 wstr = _Py_char2wchar(str, &wlen);
3508 if (wstr == NULL) {
3509 if (wlen == (size_t)-1)
3510 PyErr_NoMemory();
3511 else
3512 PyErr_SetFromErrno(PyExc_OSError);
3513 return NULL;
3514 }
3515
3516 unicode = PyUnicode_FromWideChar(wstr, wlen);
3517 PyMem_Free(wstr);
3518 }
3519 else {
3520#ifndef HAVE_BROKEN_MBSTOWCS
3521 wlen = mbstowcs(NULL, str, 0);
3522#else
3523 wlen = len;
3524#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003525 if (wlen == (size_t)-1)
3526 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 if (wlen+1 <= smallbuf_len) {
3528 wstr = smallbuf;
3529 }
3530 else {
3531 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3532 return PyErr_NoMemory();
3533
3534 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3535 if (!wstr)
3536 return PyErr_NoMemory();
3537 }
3538
3539 /* This shouldn't fail now */
3540 wlen2 = mbstowcs(wstr, str, wlen+1);
3541 if (wlen2 == (size_t)-1) {
3542 if (wstr != smallbuf)
3543 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003544 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003545 }
3546#ifdef HAVE_BROKEN_MBSTOWCS
3547 assert(wlen2 == wlen);
3548#endif
3549 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3550 if (wstr != smallbuf)
3551 PyMem_Free(wstr);
3552 }
3553 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003554
3555decode_error:
3556 errmsg = strerror(errno);
3557 assert(errmsg != NULL);
3558
3559 error_pos = mbstowcs_errorpos(str, len);
3560 if (errmsg != NULL) {
3561 size_t errlen;
3562 wstr = _Py_char2wchar(errmsg, &errlen);
3563 if (wstr != NULL) {
3564 reason = PyUnicode_FromWideChar(wstr, errlen);
3565 PyMem_Free(wstr);
3566 } else
3567 errmsg = NULL;
3568 }
3569 if (errmsg == NULL)
3570 reason = PyUnicode_FromString(
3571 "mbstowcs() encountered an invalid multibyte sequence");
3572 if (reason == NULL)
3573 return NULL;
3574
3575 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3576 "locale", str, len,
3577 (Py_ssize_t)error_pos,
3578 (Py_ssize_t)(error_pos+1),
3579 reason);
3580 Py_DECREF(reason);
3581 if (exc != NULL) {
3582 PyCodec_StrictErrors(exc);
3583 Py_XDECREF(exc);
3584 }
3585 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003586}
3587
3588PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003589PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003590{
3591 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003592 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003593}
3594
3595
3596PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003597PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003598 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003599 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3600}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601
Christian Heimes5894ba72007-11-04 11:43:14 +00003602PyObject*
3603PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3604{
Victor Stinner99b95382011-07-04 14:23:54 +02003605#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003606 return PyUnicode_DecodeMBCS(s, size, NULL);
3607#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003608 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003609#else
Victor Stinner793b5312011-04-27 00:24:21 +02003610 PyInterpreterState *interp = PyThreadState_GET()->interp;
3611 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3612 cannot use it to encode and decode filenames before it is loaded. Load
3613 the Python codec requires to encode at least its own filename. Use the C
3614 version of the locale codec until the codec registry is initialized and
3615 the Python codec is loaded.
3616
3617 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3618 cannot only rely on it: check also interp->fscodec_initialized for
3619 subinterpreters. */
3620 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003621 return PyUnicode_Decode(s, size,
3622 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003623 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624 }
3625 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003626 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003627 }
Victor Stinnerad158722010-10-27 00:25:46 +00003628#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003629}
3630
Martin v. Löwis011e8422009-05-05 04:43:17 +00003631
3632int
Antoine Pitrou13348842012-01-29 18:36:34 +01003633_PyUnicode_HasNULChars(PyObject* s)
3634{
3635 static PyObject *nul = NULL;
3636
3637 if (nul == NULL)
3638 nul = PyUnicode_FromStringAndSize("\0", 1);
3639 if (nul == NULL)
3640 return -1;
3641 return PyUnicode_Contains(s, nul);
3642}
3643
3644
3645int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003646PyUnicode_FSConverter(PyObject* arg, void* addr)
3647{
3648 PyObject *output = NULL;
3649 Py_ssize_t size;
3650 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003651 if (arg == NULL) {
3652 Py_DECREF(*(PyObject**)addr);
3653 return 1;
3654 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003655 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003656 output = arg;
3657 Py_INCREF(output);
3658 }
3659 else {
3660 arg = PyUnicode_FromObject(arg);
3661 if (!arg)
3662 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003663 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003664 Py_DECREF(arg);
3665 if (!output)
3666 return 0;
3667 if (!PyBytes_Check(output)) {
3668 Py_DECREF(output);
3669 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3670 return 0;
3671 }
3672 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003673 size = PyBytes_GET_SIZE(output);
3674 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003675 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003676 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003677 Py_DECREF(output);
3678 return 0;
3679 }
3680 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003681 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003682}
3683
3684
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003685int
3686PyUnicode_FSDecoder(PyObject* arg, void* addr)
3687{
3688 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003689 if (arg == NULL) {
3690 Py_DECREF(*(PyObject**)addr);
3691 return 1;
3692 }
3693 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003694 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003696 output = arg;
3697 Py_INCREF(output);
3698 }
3699 else {
3700 arg = PyBytes_FromObject(arg);
3701 if (!arg)
3702 return 0;
3703 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3704 PyBytes_GET_SIZE(arg));
3705 Py_DECREF(arg);
3706 if (!output)
3707 return 0;
3708 if (!PyUnicode_Check(output)) {
3709 Py_DECREF(output);
3710 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3711 return 0;
3712 }
3713 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003714 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003715 Py_DECREF(output);
3716 return 0;
3717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003719 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003720 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3721 Py_DECREF(output);
3722 return 0;
3723 }
3724 *(PyObject**)addr = output;
3725 return Py_CLEANUP_SUPPORTED;
3726}
3727
3728
Martin v. Löwis5b222132007-06-10 09:51:05 +00003729char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003730PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003731{
Christian Heimesf3863112007-11-22 07:46:41 +00003732 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003734 if (!PyUnicode_Check(unicode)) {
3735 PyErr_BadArgument();
3736 return NULL;
3737 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003738 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003739 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003741 if (PyUnicode_UTF8(unicode) == NULL) {
3742 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3744 if (bytes == NULL)
3745 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003746 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3747 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003748 Py_DECREF(bytes);
3749 return NULL;
3750 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003751 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3752 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3753 PyBytes_AS_STRING(bytes),
3754 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 Py_DECREF(bytes);
3756 }
3757
3758 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003759 *psize = PyUnicode_UTF8_LENGTH(unicode);
3760 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003761}
3762
3763char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3767}
3768
3769#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003770static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771#endif
3772
3773
3774Py_UNICODE *
3775PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 const unsigned char *one_byte;
3778#if SIZEOF_WCHAR_T == 4
3779 const Py_UCS2 *two_bytes;
3780#else
3781 const Py_UCS4 *four_bytes;
3782 const Py_UCS4 *ucs4_end;
3783 Py_ssize_t num_surrogates;
3784#endif
3785 wchar_t *w;
3786 wchar_t *wchar_end;
3787
3788 if (!PyUnicode_Check(unicode)) {
3789 PyErr_BadArgument();
3790 return NULL;
3791 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003792 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003794 assert(_PyUnicode_KIND(unicode) != 0);
3795 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796
3797#ifdef Py_DEBUG
3798 ++unicode_as_unicode_calls;
3799#endif
3800
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003801 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003803 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3804 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 num_surrogates = 0;
3806
3807 for (; four_bytes < ucs4_end; ++four_bytes) {
3808 if (*four_bytes > 0xFFFF)
3809 ++num_surrogates;
3810 }
3811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3813 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3814 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 PyErr_NoMemory();
3816 return NULL;
3817 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 w = _PyUnicode_WSTR(unicode);
3821 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3822 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3824 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003825 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003827 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3828 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 }
3830 else
3831 *w = *four_bytes;
3832
3833 if (w > wchar_end) {
3834 assert(0 && "Miscalculated string end");
3835 }
3836 }
3837 *w = 0;
3838#else
3839 /* sizeof(wchar_t) == 4 */
3840 Py_FatalError("Impossible unicode object state, wstr and str "
3841 "should share memory already.");
3842 return NULL;
3843#endif
3844 }
3845 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003846 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3847 (_PyUnicode_LENGTH(unicode) + 1));
3848 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 PyErr_NoMemory();
3850 return NULL;
3851 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003852 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3853 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3854 w = _PyUnicode_WSTR(unicode);
3855 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003857 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3858 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 for (; w < wchar_end; ++one_byte, ++w)
3860 *w = *one_byte;
3861 /* null-terminate the wstr */
3862 *w = 0;
3863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 for (; w < wchar_end; ++two_bytes, ++w)
3868 *w = *two_bytes;
3869 /* null-terminate the wstr */
3870 *w = 0;
3871#else
3872 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003873 PyObject_FREE(_PyUnicode_WSTR(unicode));
3874 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 Py_FatalError("Impossible unicode object state, wstr "
3876 "and str should share memory already.");
3877 return NULL;
3878#endif
3879 }
3880 else {
3881 assert(0 && "This should never happen.");
3882 }
3883 }
3884 }
3885 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003886 *size = PyUnicode_WSTR_LENGTH(unicode);
3887 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003888}
3889
Alexander Belopolsky40018472011-02-26 01:02:56 +00003890Py_UNICODE *
3891PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894}
3895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896
Alexander Belopolsky40018472011-02-26 01:02:56 +00003897Py_ssize_t
3898PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899{
3900 if (!PyUnicode_Check(unicode)) {
3901 PyErr_BadArgument();
3902 goto onError;
3903 }
3904 return PyUnicode_GET_SIZE(unicode);
3905
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 return -1;
3908}
3909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910Py_ssize_t
3911PyUnicode_GetLength(PyObject *unicode)
3912{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003913 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 PyErr_BadArgument();
3915 return -1;
3916 }
3917
3918 return PyUnicode_GET_LENGTH(unicode);
3919}
3920
3921Py_UCS4
3922PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3923{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003924 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3925 PyErr_BadArgument();
3926 return (Py_UCS4)-1;
3927 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003928 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003929 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003930 return (Py_UCS4)-1;
3931 }
3932 return PyUnicode_READ_CHAR(unicode, index);
3933}
3934
3935int
3936PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3937{
3938 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003939 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940 return -1;
3941 }
Victor Stinner488fa492011-12-12 00:01:39 +01003942 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003943 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003944 PyErr_SetString(PyExc_IndexError, "string index out of range");
3945 return -1;
3946 }
Victor Stinner488fa492011-12-12 00:01:39 +01003947 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003948 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003949 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3950 PyErr_SetString(PyExc_ValueError, "character out of range");
3951 return -1;
3952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3954 index, ch);
3955 return 0;
3956}
3957
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3963
Victor Stinner554f3f02010-06-16 23:33:54 +00003964/* create or adjust a UnicodeDecodeError */
3965static void
3966make_decode_exception(PyObject **exceptionObject,
3967 const char *encoding,
3968 const char *input, Py_ssize_t length,
3969 Py_ssize_t startpos, Py_ssize_t endpos,
3970 const char *reason)
3971{
3972 if (*exceptionObject == NULL) {
3973 *exceptionObject = PyUnicodeDecodeError_Create(
3974 encoding, input, length, startpos, endpos, reason);
3975 }
3976 else {
3977 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3978 goto onError;
3979 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3980 goto onError;
3981 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3982 goto onError;
3983 }
3984 return;
3985
3986onError:
3987 Py_DECREF(*exceptionObject);
3988 *exceptionObject = NULL;
3989}
3990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991/* error handling callback helper:
3992 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003993 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 and adjust various state variables.
3995 return 0 on success, -1 on error
3996*/
3997
Alexander Belopolsky40018472011-02-26 01:02:56 +00003998static int
3999unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004000 const char *encoding, const char *reason,
4001 const char **input, const char **inend, Py_ssize_t *startinpos,
4002 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004003 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004005 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006
4007 PyObject *restuple = NULL;
4008 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004009 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004010 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004011 Py_ssize_t requiredsize;
4012 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004013 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 int res = -1;
4015
Victor Stinner596a6c42011-11-09 00:02:18 +01004016 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4017 outsize = PyUnicode_GET_LENGTH(*output);
4018 else
4019 outsize = _PyUnicode_WSTR_LENGTH(*output);
4020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004022 *errorHandler = PyCodec_LookupError(errors);
4023 if (*errorHandler == NULL)
4024 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 }
4026
Victor Stinner554f3f02010-06-16 23:33:54 +00004027 make_decode_exception(exceptionObject,
4028 encoding,
4029 *input, *inend - *input,
4030 *startinpos, *endinpos,
4031 reason);
4032 if (*exceptionObject == NULL)
4033 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034
4035 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4036 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004039 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 }
4042 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004044 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004045 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004046
4047 /* Copy back the bytes variables, which might have been modified by the
4048 callback */
4049 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4050 if (!inputobj)
4051 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004052 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004054 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004055 *input = PyBytes_AS_STRING(inputobj);
4056 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004057 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004058 /* we can DECREF safely, as the exception has another reference,
4059 so the object won't go away. */
4060 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004064 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4066 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068
Victor Stinner596a6c42011-11-09 00:02:18 +01004069 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4070 /* need more space? (at least enough for what we
4071 have+the replacement+the rest of the string (starting
4072 at the new input position), so we won't have to check space
4073 when there are no errors in the rest of the string) */
4074 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4075 requiredsize = *outpos + replen + insize-newpos;
4076 if (requiredsize > outsize) {
4077 if (requiredsize<2*outsize)
4078 requiredsize = 2*outsize;
4079 if (unicode_resize(output, requiredsize) < 0)
4080 goto onError;
4081 }
4082 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004083 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004084 copy_characters(*output, *outpos, repunicode, 0, replen);
4085 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004087 else {
4088 wchar_t *repwstr;
4089 Py_ssize_t repwlen;
4090 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4091 if (repwstr == NULL)
4092 goto onError;
4093 /* need more space? (at least enough for what we
4094 have+the replacement+the rest of the string (starting
4095 at the new input position), so we won't have to check space
4096 when there are no errors in the rest of the string) */
4097 requiredsize = *outpos + repwlen + insize-newpos;
4098 if (requiredsize > outsize) {
4099 if (requiredsize < 2*outsize)
4100 requiredsize = 2*outsize;
4101 if (unicode_resize(output, requiredsize) < 0)
4102 goto onError;
4103 }
4104 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4105 *outpos += repwlen;
4106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004108 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 /* we made it! */
4111 res = 0;
4112
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 Py_XDECREF(restuple);
4115 return res;
4116}
4117
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass expressions without side effects. */

/* IS_BASE64: true if c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of c.  Only meaningful when
   IS_BASE64(c) is true: any character that is not a letter, digit or '+'
   maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4152
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated more than once.  directO and directWS are parenthesized
 * so that compound flag expressions (e.g. "a || b") combine correctly
 * with the category tests. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004198
Alexander Belopolsky40018472011-02-26 01:02:56 +00004199PyObject *
4200PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004201 Py_ssize_t size,
4202 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004203{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004204 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4205}
4206
Antoine Pitrou244651a2009-05-04 18:56:13 +00004207/* The decoder. The only state we preserve is our read position,
4208 * i.e. how many characters we have consumed. So if we end in the
4209 * middle of a shift sequence we have to back off the read position
4210 * and the output to the beginning of the sequence, otherwise we lose
4211 * all the shift state (seen bits, number of bits seen, high
4212 * surrogate). */
4213
Alexander Belopolsky40018472011-02-26 01:02:56 +00004214PyObject *
4215PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004216 Py_ssize_t size,
4217 const char *errors,
4218 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004219{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221 Py_ssize_t startinpos;
4222 Py_ssize_t endinpos;
4223 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004224 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004225 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004226 const char *errmsg = "";
4227 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004228 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004229 unsigned int base64bits = 0;
4230 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004231 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 PyObject *errorHandler = NULL;
4233 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004234
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004235 /* Start off assuming it's all ASCII. Widen later as necessary. */
4236 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004237 if (!unicode)
4238 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004239 if (size == 0) {
4240 if (consumed)
4241 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004242 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004243 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004245 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246 e = s + size;
4247
4248 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004249 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004251 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004252
Antoine Pitrou244651a2009-05-04 18:56:13 +00004253 if (inShift) { /* in a base-64 section */
4254 if (IS_BASE64(ch)) { /* consume a base-64 character */
4255 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4256 base64bits += 6;
4257 s++;
4258 if (base64bits >= 16) {
4259 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004260 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004261 base64bits -= 16;
4262 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4263 if (surrogate) {
4264 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004265 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4266 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004267 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4268 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004269 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004270 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004271 }
4272 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004273 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4274 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004275 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004276 }
4277 }
Victor Stinner551ac952011-11-29 22:58:13 +01004278 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004279 /* first surrogate */
4280 surrogate = outCh;
4281 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004283 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4284 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004285 }
4286 }
4287 }
4288 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004289 inShift = 0;
4290 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004292 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4293 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004294 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296 if (base64bits > 0) { /* left-over bits */
4297 if (base64bits >= 6) {
4298 /* We've seen at least one base-64 character */
4299 errmsg = "partial character in shift sequence";
4300 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004301 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302 else {
4303 /* Some bits remain; they should be zero */
4304 if (base64buffer != 0) {
4305 errmsg = "non-zero padding bits in shift sequence";
4306 goto utf7Error;
4307 }
4308 }
4309 }
4310 if (ch != '-') {
4311 /* '-' is absorbed; other terminating
4312 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004313 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4314 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316 }
4317 }
4318 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 s++; /* consume '+' */
4321 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004323 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4324 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 }
4326 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004328 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 }
4331 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004332 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4334 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 s++;
4336 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 else {
4338 startinpos = s-starts;
4339 s++;
4340 errmsg = "unexpected special character";
4341 goto utf7Error;
4342 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 endinpos = s-starts;
4346 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 errors, &errorHandler,
4348 "utf7", errmsg,
4349 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004350 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004352 }
4353
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 /* end of string */
4355
4356 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4357 /* if we're in an inconsistent state, that's an error */
4358 if (surrogate ||
4359 (base64bits >= 6) ||
4360 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 endinpos = size;
4362 if (unicode_decode_call_errorhandler(
4363 errors, &errorHandler,
4364 "utf7", "unterminated shift sequence",
4365 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 goto onError;
4368 if (s < e)
4369 goto restart;
4370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372
4373 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004374 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004376 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004377 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 }
4379 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004380 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004382 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 goto onError;
4386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 Py_XDECREF(errorHandler);
4388 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004389 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 Py_XDECREF(errorHandler);
4393 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004394 Py_DECREF(unicode);
4395 return NULL;
4396}
4397
4398
Alexander Belopolsky40018472011-02-26 01:02:56 +00004399PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004400_PyUnicode_EncodeUTF7(PyObject *str,
4401 int base64SetO,
4402 int base64WhiteSpace,
4403 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004405 int kind;
4406 void *data;
4407 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004408 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004409 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004411 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 unsigned int base64bits = 0;
4413 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 char * out;
4415 char * start;
4416
Benjamin Petersonbac79492012-01-14 13:34:47 -05004417 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004418 return NULL;
4419 kind = PyUnicode_KIND(str);
4420 data = PyUnicode_DATA(str);
4421 len = PyUnicode_GET_LENGTH(str);
4422
4423 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004426 /* It might be possible to tighten this worst case */
4427 allocated = 8 * len;
4428 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004429 return PyErr_NoMemory();
4430
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 if (v == NULL)
4433 return NULL;
4434
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004435 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004436 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004437 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 if (inShift) {
4440 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4441 /* shifting out */
4442 if (base64bits) { /* output remaining bits */
4443 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4444 base64buffer = 0;
4445 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004446 }
4447 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 /* Characters not in the BASE64 set implicitly unshift the sequence
4449 so no '-' is required, except if the character is itself a '-' */
4450 if (IS_BASE64(ch) || ch == '-') {
4451 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 *out++ = (char) ch;
4454 }
4455 else {
4456 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004457 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 else { /* not in a shift sequence */
4460 if (ch == '+') {
4461 *out++ = '+';
4462 *out++ = '-';
4463 }
4464 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4465 *out++ = (char) ch;
4466 }
4467 else {
4468 *out++ = '+';
4469 inShift = 1;
4470 goto encode_char;
4471 }
4472 }
4473 continue;
4474encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004475 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004476 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004477
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 /* code first surrogate */
4479 base64bits += 16;
4480 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4481 while (base64bits >= 6) {
4482 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4483 base64bits -= 6;
4484 }
4485 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004486 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 base64bits += 16;
4489 base64buffer = (base64buffer << 16) | ch;
4490 while (base64bits >= 6) {
4491 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4492 base64bits -= 6;
4493 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004494 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 if (base64bits)
4496 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4497 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004499 if (_PyBytes_Resize(&v, out - start) < 0)
4500 return NULL;
4501 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503PyObject *
4504PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4505 Py_ssize_t size,
4506 int base64SetO,
4507 int base64WhiteSpace,
4508 const char *errors)
4509{
4510 PyObject *result;
4511 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4512 if (tmp == NULL)
4513 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004514 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515 base64WhiteSpace, errors);
4516 Py_DECREF(tmp);
4517 return result;
4518}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520#undef IS_BASE64
4521#undef FROM_BASE64
4522#undef TO_BASE64
4523#undef DECODE_DIRECT
4524#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004525
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526/* --- UTF-8 Codec -------------------------------------------------------- */
4527
/* Map a UTF-8 lead byte to the total length in bytes of the sequence it
   starts.  Zero means the byte is an illegal prefix (a continuation byte,
   an overlong lead C0/C1, or a lead above F4).  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4549
Alexander Belopolsky40018472011-02-26 01:02:56 +00004550PyObject *
4551PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004552 Py_ssize_t size,
4553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554{
Walter Dörwald69652032004-09-07 20:24:22 +00004555 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4556}
4557
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004558#include "stringlib/ucs1lib.h"
4559#include "stringlib/codecs.h"
4560#include "stringlib/undef.h"
4561
4562#include "stringlib/ucs2lib.h"
4563#include "stringlib/codecs.h"
4564#include "stringlib/undef.h"
4565
4566#include "stringlib/ucs4lib.h"
4567#include "stringlib/codecs.h"
4568#include "stringlib/undef.h"
4569
Antoine Pitrouab868312009-01-10 15:40:25 +00004570/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4571#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4572
4573/* Mask to quickly check whether a C 'long' contains a
4574 non-ASCII, UTF8-encoded char. */
4575#if (SIZEOF_LONG == 8)
4576# define ASCII_CHAR_MASK 0x8080808080808080L
4577#elif (SIZEOF_LONG == 4)
4578# define ASCII_CHAR_MASK 0x80808080L
4579#else
4580# error C 'long' size should be either 4 or 8!
4581#endif
4582
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004583/* Scans a UTF-8 string and returns the maximum character to be expected
4584 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004585
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004586 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004587 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004588 */
4589static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004590utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004592 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004593 const unsigned char *end = p + string_size;
4594 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004595
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004596 assert(unicode_size != NULL);
4597
4598 /* By having a cascade of independent loops which fallback onto each
4599 other, we minimize the amount of work done in the average loop
4600 iteration, and we also maximize the CPU's ability to predict
4601 branches correctly (because a given condition will have always the
4602 same boolean outcome except perhaps in the last iteration of the
4603 corresponding loop).
4604 In the general case this brings us rather close to decoding
4605 performance pre-PEP 393, despite the two-pass decoding.
4606
4607 Note that the pure ASCII loop is not duplicated once a non-ASCII
4608 character has been encountered. It is actually a pessimization (by
4609 a significant factor) to use this loop on text with many non-ASCII
4610 characters, and it is important to avoid bad performance on valid
4611 utf-8 data (invalid utf-8 being a different can of worms).
4612 */
4613
4614 /* ASCII */
4615 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 /* Only check value if it's not a ASCII char... */
4617 if (*p < 0x80) {
4618 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4619 an explanation. */
4620 if (!((size_t) p & LONG_PTR_MASK)) {
4621 /* Help register allocation */
4622 register const unsigned char *_p = p;
4623 while (_p < aligned_end) {
4624 unsigned long value = *(unsigned long *) _p;
4625 if (value & ASCII_CHAR_MASK)
4626 break;
4627 _p += SIZEOF_LONG;
4628 char_count += SIZEOF_LONG;
4629 }
4630 p = _p;
4631 if (p == end)
4632 break;
4633 }
4634 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004635 if (*p < 0x80)
4636 ++char_count;
4637 else
4638 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004640 *unicode_size = char_count;
4641 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004642
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004643_ucs1loop:
4644 for (; p < end; ++p) {
4645 if (*p < 0xc4)
4646 char_count += ((*p & 0xc0) != 0x80);
4647 else
4648 goto _ucs2loop;
4649 }
4650 *unicode_size = char_count;
4651 return 255;
4652
4653_ucs2loop:
4654 for (; p < end; ++p) {
4655 if (*p < 0xf0)
4656 char_count += ((*p & 0xc0) != 0x80);
4657 else
4658 goto _ucs4loop;
4659 }
4660 *unicode_size = char_count;
4661 return 65535;
4662
4663_ucs4loop:
4664 for (; p < end; ++p) {
4665 char_count += ((*p & 0xc0) != 0x80);
4666 }
4667 *unicode_size = char_count;
4668 return 65537;
4669}
4670
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, onError (the enclosing
   function must provide a PyObject *unicode variable and an onError label).

   `index` is evaluated exactly once.  A position is writable only when it
   is strictly smaller than the current length, so the string is grown
   whenever index >= length, and grown to at least index + 1 characters so
   that the write never lands on the terminator slot.
   Potential resizing overallocates, so the result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = (index);                                       \
        if (pos >= PyUnicode_GET_LENGTH(unicode) &&                     \
            unicode_resize(&unicode, pos + 1 + pos/8) < 0)              \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, (value)) < 0)               \
            goto onError;                                               \
    } while (0)
4684
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004685static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004686decode_utf8_errors(const char *starts,
4687 Py_ssize_t size,
4688 const char *errors,
4689 Py_ssize_t *consumed,
4690 const char *s,
4691 PyObject *unicode,
4692 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004693{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004695 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004696 Py_ssize_t startinpos;
4697 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004698 const char *e = starts + size;
4699 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004700 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 PyObject *errorHandler = NULL;
4702 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004703
Antoine Pitrouab868312009-01-10 15:40:25 +00004704 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
4706 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004707 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708
4709 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004710 /* Fast path for runs of ASCII characters. Given that common UTF-8
4711 input will consist of an overwhelming majority of ASCII
4712 characters, we try to optimize for this case by checking
4713 as many characters as a C 'long' can contain.
4714 First, check if we can do an aligned read, as most CPUs have
4715 a penalty for unaligned reads.
4716 */
4717 if (!((size_t) s & LONG_PTR_MASK)) {
4718 /* Help register allocation */
4719 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004720 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004721 while (_s < aligned_end) {
4722 /* Read a whole long at a time (either 4 or 8 bytes),
4723 and do a fast unrolled copy if it only contains ASCII
4724 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004725 unsigned long value = *(unsigned long *) _s;
4726 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004727 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004728 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4729 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4730 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4731 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004732#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004733 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4734 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4735 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4736 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004737#endif
4738 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004739 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004740 }
4741 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004743 if (s == e)
4744 break;
4745 ch = (unsigned char)*s;
4746 }
4747 }
4748
4749 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004750 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 s++;
4752 continue;
4753 }
4754
4755 n = utf8_code_length[ch];
4756
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004757 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 if (consumed)
4759 break;
4760 else {
4761 errmsg = "unexpected end of data";
4762 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004763 endinpos = startinpos+1;
4764 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4765 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 goto utf8Error;
4767 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769
4770 switch (n) {
4771
4772 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004773 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 startinpos = s-starts;
4775 endinpos = startinpos+1;
4776 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777
4778 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004779 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 startinpos = s-starts;
4781 endinpos = startinpos+1;
4782 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783
4784 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004785 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004786 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004788 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 goto utf8Error;
4790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004792 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004793 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 break;
4795
4796 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004797 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4798 will result in surrogates in range d800-dfff. Surrogates are
4799 not valid UTF-8 so they are rejected.
4800 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4801 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004802 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004803 (s[2] & 0xc0) != 0x80 ||
4804 ((unsigned char)s[0] == 0xE0 &&
4805 (unsigned char)s[1] < 0xA0) ||
4806 ((unsigned char)s[0] == 0xED &&
4807 (unsigned char)s[1] > 0x9F)) {
4808 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004810 endinpos = startinpos + 1;
4811
4812 /* if s[1] first two bits are 1 and 0, then the invalid
4813 continuation byte is s[2], so increment endinpos by 1,
4814 if not, s[1] is invalid and endinpos doesn't need to
4815 be incremented. */
4816 if ((s[1] & 0xC0) == 0x80)
4817 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 goto utf8Error;
4819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004821 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004822 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004823 break;
4824
4825 case 4:
4826 if ((s[1] & 0xc0) != 0x80 ||
4827 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004828 (s[3] & 0xc0) != 0x80 ||
4829 ((unsigned char)s[0] == 0xF0 &&
4830 (unsigned char)s[1] < 0x90) ||
4831 ((unsigned char)s[0] == 0xF4 &&
4832 (unsigned char)s[1] > 0x8F)) {
4833 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004835 endinpos = startinpos + 1;
4836 if ((s[1] & 0xC0) == 0x80) {
4837 endinpos++;
4838 if ((s[2] & 0xC0) == 0x80)
4839 endinpos++;
4840 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 goto utf8Error;
4842 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004843 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004844 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004845 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004846
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004847 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 }
4850 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004852
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 if (unicode_decode_call_errorhandler(
4855 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004856 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004858 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860 /* Update data because unicode_decode_call_errorhandler might have
4861 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 }
Walter Dörwald69652032004-09-07 20:24:22 +00004864 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004865 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867 /* Adjust length and ready string when it contained errors and
4868 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004869 if (unicode_resize(&unicode, i) < 0)
4870 goto onError;
4871 unicode_adjust_maxchar(&unicode);
4872 if (unicode == NULL)
4873 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 Py_XDECREF(errorHandler);
4876 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004877 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004878 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881 Py_XDECREF(errorHandler);
4882 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004883 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 return NULL;
4885}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004886#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004887
Victor Stinner785938e2011-12-11 20:09:03 +01004888PyObject *
4889PyUnicode_DecodeUTF8Stateful(const char *s,
4890 Py_ssize_t size,
4891 const char *errors,
4892 Py_ssize_t *consumed)
4893{
4894 Py_UCS4 maxchar = 0;
4895 Py_ssize_t unicode_size;
4896 int has_errors = 0;
4897 PyObject *unicode;
4898 int kind;
4899 void *data;
4900 const char *starts = s;
4901 const char *e;
4902 Py_ssize_t i;
4903
4904 if (size == 0) {
4905 if (consumed)
4906 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004907 Py_INCREF(unicode_empty);
4908 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004909 }
4910
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004911 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004912
4913 /* When the string is ASCII only, just use memcpy and return.
4914 unicode_size may be != size if there is an incomplete UTF-8
4915 sequence at the end of the ASCII block. */
4916 if (maxchar < 128 && size == unicode_size) {
4917 if (consumed)
4918 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004919 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004920 }
4921
4922 unicode = PyUnicode_New(unicode_size, maxchar);
4923 if (!unicode)
4924 return NULL;
4925 kind = PyUnicode_KIND(unicode);
4926 data = PyUnicode_DATA(unicode);
4927
4928 /* Unpack UTF-8 encoded data */
4929 i = 0;
4930 e = starts + size;
4931 switch (kind) {
4932 case PyUnicode_1BYTE_KIND:
4933 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4934 break;
4935 case PyUnicode_2BYTE_KIND:
4936 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4937 break;
4938 case PyUnicode_4BYTE_KIND:
4939 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4940 break;
4941 }
4942 if (!has_errors) {
4943 /* Ensure the unicode size calculation was correct */
4944 assert(i == unicode_size);
4945 assert(s == e);
4946 if (consumed)
4947 *consumed = size;
4948 return unicode;
4949 }
4950
4951 /* In case of errors, maxchar and size computation might be incorrect;
4952 code below refits and resizes as necessary. */
4953 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4954}
4955
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t string.  Every byte that does not begin a valid
   UTF-8 sequence is emitted as 0xDC00 + byte and decoding resumes at the
   following byte.  Returns NULL if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *result, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (result == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    end = s + size;
    while (s < end) {
        Py_UCS4 ch = (unsigned char)*s;
        int n;
        int valid;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        /* Step 1: decide whether a well-formed sequence starts at s. */
        n = utf8_code_length[ch];
        if (s + n > end) {
            /* truncated by the end of the input */
            valid = 0;
        }
        else if (n == 2) {
            valid = ((s[1] & 0xc0) == 0x80);
        }
        else if (n == 3) {
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            valid = ((s[1] & 0xc0) == 0x80 &&
                     (s[2] & 0xc0) == 0x80 &&
                     !((unsigned char)s[0] == 0xE0 &&
                       (unsigned char)s[1] < 0xA0) &&
                     !((unsigned char)s[0] == 0xED &&
                       (unsigned char)s[1] > 0x9F));
        }
        else if (n == 4) {
            valid = ((s[1] & 0xc0) == 0x80 &&
                     (s[2] & 0xc0) == 0x80 &&
                     (s[3] & 0xc0) == 0x80 &&
                     !((unsigned char)s[0] == 0xF0 &&
                       (unsigned char)s[1] < 0x90) &&
                     !((unsigned char)s[0] == 0xF4 &&
                       (unsigned char)s[1] > 0x8F));
        }
        else {
            /* n is 0 (illegal lead byte) or 1 (cannot happen here) */
            valid = 0;
        }

        if (!valid) {
            /* surrogateescape: ch still holds the offending byte */
            *out++ = 0xDC00 + ch;
            s++;
            continue;
        }

        /* Step 2: assemble the code point and store it. */
        if (n == 2) {
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (n == 3) {
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else {
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        s += n;
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005064/* Primary internal function which creates utf8 encoded bytes objects.
5065
5066 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005067 and allocate exactly as much space needed at the end. Else allocate the
5068 maximum possible needed (4 result bytes per Unicode character), and return
5069 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005070*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005071PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005072_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073{
Victor Stinner6099a032011-12-18 14:22:26 +01005074 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005075 void *data;
5076 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005078 if (!PyUnicode_Check(unicode)) {
5079 PyErr_BadArgument();
5080 return NULL;
5081 }
5082
5083 if (PyUnicode_READY(unicode) == -1)
5084 return NULL;
5085
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005086 if (PyUnicode_UTF8(unicode))
5087 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5088 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005089
5090 kind = PyUnicode_KIND(unicode);
5091 data = PyUnicode_DATA(unicode);
5092 size = PyUnicode_GET_LENGTH(unicode);
5093
Benjamin Petersonead6b532011-12-20 17:23:42 -06005094 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005095 default:
5096 assert(0);
5097 case PyUnicode_1BYTE_KIND:
5098 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5099 assert(!PyUnicode_IS_ASCII(unicode));
5100 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5101 case PyUnicode_2BYTE_KIND:
5102 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5103 case PyUnicode_4BYTE_KIND:
5104 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106}
5107
Alexander Belopolsky40018472011-02-26 01:02:56 +00005108PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5110 Py_ssize_t size,
5111 const char *errors)
5112{
5113 PyObject *v, *unicode;
5114
5115 unicode = PyUnicode_FromUnicode(s, size);
5116 if (unicode == NULL)
5117 return NULL;
5118 v = _PyUnicode_AsUTF8String(unicode, errors);
5119 Py_DECREF(unicode);
5120 return v;
5121}
5122
5123PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005124PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127}
5128
Walter Dörwald41980ca2007-08-16 21:55:45 +00005129/* --- UTF-32 Codec ------------------------------------------------------- */
5130
5131PyObject *
5132PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 Py_ssize_t size,
5134 const char *errors,
5135 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136{
5137 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5138}
5139
5140PyObject *
5141PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 Py_ssize_t size,
5143 const char *errors,
5144 int *byteorder,
5145 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146{
5147 const char *starts = s;
5148 Py_ssize_t startinpos;
5149 Py_ssize_t endinpos;
5150 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005151 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005152 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005153 int bo = 0; /* assume native ordering by default */
5154 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155 /* Offsets from q for retrieving bytes in the right order. */
5156#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5157 int iorder[] = {0, 1, 2, 3};
5158#else
5159 int iorder[] = {3, 2, 1, 0};
5160#endif
5161 PyObject *errorHandler = NULL;
5162 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005163
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164 q = (unsigned char *)s;
5165 e = q + size;
5166
5167 if (byteorder)
5168 bo = *byteorder;
5169
5170 /* Check for BOM marks (U+FEFF) in the input and adjust current
5171 byte order setting accordingly. In native mode, the leading BOM
5172 mark is skipped, in all other modes, it is copied to the output
5173 stream as-is (giving a ZWNBSP character). */
5174 if (bo == 0) {
5175 if (size >= 4) {
5176 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005178#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 if (bom == 0x0000FEFF) {
5180 q += 4;
5181 bo = -1;
5182 }
5183 else if (bom == 0xFFFE0000) {
5184 q += 4;
5185 bo = 1;
5186 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005187#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 if (bom == 0x0000FEFF) {
5189 q += 4;
5190 bo = 1;
5191 }
5192 else if (bom == 0xFFFE0000) {
5193 q += 4;
5194 bo = -1;
5195 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005196#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 }
5199
5200 if (bo == -1) {
5201 /* force LE */
5202 iorder[0] = 0;
5203 iorder[1] = 1;
5204 iorder[2] = 2;
5205 iorder[3] = 3;
5206 }
5207 else if (bo == 1) {
5208 /* force BE */
5209 iorder[0] = 3;
5210 iorder[1] = 2;
5211 iorder[2] = 1;
5212 iorder[3] = 0;
5213 }
5214
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005215 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005216 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005217 if (!unicode)
5218 return NULL;
5219 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005220 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005221 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005222
Walter Dörwald41980ca2007-08-16 21:55:45 +00005223 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 Py_UCS4 ch;
5225 /* remaining bytes at the end? (size should be divisible by 4) */
5226 if (e-q<4) {
5227 if (consumed)
5228 break;
5229 errmsg = "truncated data";
5230 startinpos = ((const char *)q)-starts;
5231 endinpos = ((const char *)e)-starts;
5232 goto utf32Error;
5233 /* The remaining input chars are ignored if the callback
5234 chooses to skip the input */
5235 }
5236 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5237 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (ch >= 0x110000)
5240 {
5241 errmsg = "codepoint not in range(0x110000)";
5242 startinpos = ((const char *)q)-starts;
5243 endinpos = startinpos+4;
5244 goto utf32Error;
5245 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005246 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5247 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 q += 4;
5249 continue;
5250 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 if (unicode_decode_call_errorhandler(
5252 errors, &errorHandler,
5253 "utf32", errmsg,
5254 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005255 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257 }
5258
5259 if (byteorder)
5260 *byteorder = bo;
5261
5262 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264
5265 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005266 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267 goto onError;
5268
5269 Py_XDECREF(errorHandler);
5270 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005271 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005272
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274 Py_DECREF(unicode);
5275 Py_XDECREF(errorHandler);
5276 Py_XDECREF(exc);
5277 return NULL;
5278}
5279
5280PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005281_PyUnicode_EncodeUTF32(PyObject *str,
5282 const char *errors,
5283 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005284{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005285 int kind;
5286 void *data;
5287 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005288 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005289 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005290 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005291 /* Offsets from p for storing byte pairs in the right order. */
5292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5293 int iorder[] = {0, 1, 2, 3};
5294#else
5295 int iorder[] = {3, 2, 1, 0};
5296#endif
5297
Benjamin Peterson29060642009-01-31 22:14:21 +00005298#define STORECHAR(CH) \
5299 do { \
5300 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5301 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5302 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5303 p[iorder[0]] = (CH) & 0xff; \
5304 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305 } while(0)
5306
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005307 if (!PyUnicode_Check(str)) {
5308 PyErr_BadArgument();
5309 return NULL;
5310 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005311 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005312 return NULL;
5313 kind = PyUnicode_KIND(str);
5314 data = PyUnicode_DATA(str);
5315 len = PyUnicode_GET_LENGTH(str);
5316
5317 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005318 bytesize = nsize * 4;
5319 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005321 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005322 if (v == NULL)
5323 return NULL;
5324
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005325 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005326 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005328 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005329 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005330
5331 if (byteorder == -1) {
5332 /* force LE */
5333 iorder[0] = 0;
5334 iorder[1] = 1;
5335 iorder[2] = 2;
5336 iorder[3] = 3;
5337 }
5338 else if (byteorder == 1) {
5339 /* force BE */
5340 iorder[0] = 3;
5341 iorder[1] = 2;
5342 iorder[2] = 1;
5343 iorder[3] = 0;
5344 }
5345
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005346 for (i = 0; i < len; i++)
5347 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005348
5349 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005350 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005351#undef STORECHAR
5352}
5353
Alexander Belopolsky40018472011-02-26 01:02:56 +00005354PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005355PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5356 Py_ssize_t size,
5357 const char *errors,
5358 int byteorder)
5359{
5360 PyObject *result;
5361 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5362 if (tmp == NULL)
5363 return NULL;
5364 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5365 Py_DECREF(tmp);
5366 return result;
5367}
5368
5369PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005370PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371{
Victor Stinnerb960b342011-11-20 19:12:52 +01005372 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005373}
5374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375/* --- UTF-16 Codec ------------------------------------------------------- */
5376
Tim Peters772747b2001-08-09 22:21:55 +00005377PyObject *
5378PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 Py_ssize_t size,
5380 const char *errors,
5381 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382{
Walter Dörwald69652032004-09-07 20:24:22 +00005383 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5384}
5385
/* Masks for fast processing of a whole C 'long' of UTF-16 input at once.

   FAST_CHAR_MASK and SWAPPED_FAST_CHAR_MASK are used for fast checking of
   whether a C 'long' may contain UTF16-encoded surrogate characters.  This
   is an efficient heuristic, assuming that non-surrogate characters with a
   code point >= 0x8000 are rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   STRIPPED_MASK selects the low byte of every 16-bit unit in the 'long';
   it is used to swap the two bytes of each unit when the input is in
   byteswapped ordering.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005393#if (SIZEOF_LONG == 8)
5394# define FAST_CHAR_MASK 0x8000800080008000L
5395# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005396# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005397#elif (SIZEOF_LONG == 4)
5398# define FAST_CHAR_MASK 0x80008000L
5399# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005400# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005401#else
5402# error C 'long' size should be either 4 or 8!
5403#endif
5404
Walter Dörwald69652032004-09-07 20:24:22 +00005405PyObject *
5406PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 Py_ssize_t size,
5408 const char *errors,
5409 int *byteorder,
5410 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005411{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005413 Py_ssize_t startinpos;
5414 Py_ssize_t endinpos;
5415 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005416 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005417 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005418 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005419 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005420 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005421 /* Offsets from q for retrieving byte pairs in the right order. */
5422#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5423 int ihi = 1, ilo = 0;
5424#else
5425 int ihi = 0, ilo = 1;
5426#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 PyObject *errorHandler = NULL;
5428 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
5430 /* Note: size will always be longer than the resulting Unicode
5431 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005432 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 if (!unicode)
5434 return NULL;
5435 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005436 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
Tim Peters772747b2001-08-09 22:21:55 +00005439 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005440 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
5442 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005443 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005445 /* Check for BOM marks (U+FEFF) in the input and adjust current
5446 byte order setting accordingly. In native mode, the leading BOM
5447 mark is skipped, in all other modes, it is copied to the output
5448 stream as-is (giving a ZWNBSP character). */
5449 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005450 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005451 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005452#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 if (bom == 0xFEFF) {
5454 q += 2;
5455 bo = -1;
5456 }
5457 else if (bom == 0xFFFE) {
5458 q += 2;
5459 bo = 1;
5460 }
Tim Petersced69f82003-09-16 20:30:58 +00005461#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 if (bom == 0xFEFF) {
5463 q += 2;
5464 bo = 1;
5465 }
5466 else if (bom == 0xFFFE) {
5467 q += 2;
5468 bo = -1;
5469 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005470#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
Tim Peters772747b2001-08-09 22:21:55 +00005474 if (bo == -1) {
5475 /* force LE */
5476 ihi = 1;
5477 ilo = 0;
5478 }
5479 else if (bo == 1) {
5480 /* force BE */
5481 ihi = 0;
5482 ilo = 1;
5483 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005484#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5485 native_ordering = ilo < ihi;
5486#else
5487 native_ordering = ilo > ihi;
5488#endif
Tim Peters772747b2001-08-09 22:21:55 +00005489
Antoine Pitrouab868312009-01-10 15:40:25 +00005490 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005491 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005492 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005493 /* First check for possible aligned read of a C 'long'. Unaligned
5494 reads are more expensive, better to defer to another iteration. */
5495 if (!((size_t) q & LONG_PTR_MASK)) {
5496 /* Fast path for runs of non-surrogate chars. */
5497 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005498 int kind = PyUnicode_KIND(unicode);
5499 void *data = PyUnicode_DATA(unicode);
5500 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005501 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005502 Py_UCS4 maxch;
5503 if (native_ordering) {
5504 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005505 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005506 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005507 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005508 else {
5509 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005510 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005511 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005512 block = ((block >> 8) & STRIPPED_MASK) |
5513 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005514 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005515 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005516#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005517 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5518 maxch = Py_MAX(maxch, ch);
5519 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5520 maxch = Py_MAX(maxch, ch);
5521 ch = (Py_UCS2)(block >> 48);
5522 maxch = Py_MAX(maxch, ch);
5523#else
5524 ch = (Py_UCS2)(block >> 16);
5525 maxch = Py_MAX(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005526#endif
5527 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5528 if (unicode_widen(&unicode, maxch) < 0)
5529 goto onError;
5530 kind = PyUnicode_KIND(unicode);
5531 data = PyUnicode_DATA(unicode);
5532 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005533#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5534 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005535#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005536 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5537 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5538 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5539#else
5540 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5541#endif
5542#else
5543#if SIZEOF_LONG == 8
5544 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5545 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5546 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5547#else
5548 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5549#endif
5550 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005551#endif
5552 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005553 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005554 q = _q;
5555 if (q >= e)
5556 break;
5557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559
Benjamin Peterson14339b62009-01-31 16:36:08 +00005560 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005561
Victor Stinner551ac952011-11-29 22:58:13 +01005562 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005563 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5564 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 continue;
5566 }
5567
5568 /* UTF-16 code pair: */
5569 if (q > e) {
5570 errmsg = "unexpected end of data";
5571 startinpos = (((const char *)q) - 2) - starts;
5572 endinpos = ((const char *)e) + 1 - starts;
5573 goto utf16Error;
5574 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005575 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5576 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005578 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005579 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005580 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005581 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 continue;
5583 }
5584 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005585 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 startinpos = (((const char *)q)-4)-starts;
5587 endinpos = startinpos+2;
5588 goto utf16Error;
5589 }
5590
Benjamin Peterson14339b62009-01-31 16:36:08 +00005591 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 errmsg = "illegal encoding";
5593 startinpos = (((const char *)q)-2)-starts;
5594 endinpos = startinpos+2;
5595 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005596
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005599 errors,
5600 &errorHandler,
5601 "utf16", errmsg,
5602 &starts,
5603 (const char **)&e,
5604 &startinpos,
5605 &endinpos,
5606 &exc,
5607 (const char **)&q,
5608 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005609 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005612 /* remaining byte at the end? (size should be even) */
5613 if (e == q) {
5614 if (!consumed) {
5615 errmsg = "truncated data";
5616 startinpos = ((const char *)q) - starts;
5617 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005618 if (unicode_decode_call_errorhandler(
5619 errors,
5620 &errorHandler,
5621 "utf16", errmsg,
5622 &starts,
5623 (const char **)&e,
5624 &startinpos,
5625 &endinpos,
5626 &exc,
5627 (const char **)&q,
5628 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005629 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 goto onError;
5631 /* The remaining input chars are ignored if the callback
5632 chooses to skip the input */
5633 }
5634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636 if (byteorder)
5637 *byteorder = bo;
5638
Walter Dörwald69652032004-09-07 20:24:22 +00005639 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005643 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 goto onError;
5645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 Py_XDECREF(errorHandler);
5647 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005648 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652 Py_XDECREF(errorHandler);
5653 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 return NULL;
5655}
5656
Antoine Pitrouab868312009-01-10 15:40:25 +00005657#undef FAST_CHAR_MASK
5658#undef SWAPPED_FAST_CHAR_MASK
5659
Tim Peters772747b2001-08-09 22:21:55 +00005660PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005661_PyUnicode_EncodeUTF16(PyObject *str,
5662 const char *errors,
5663 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 int kind;
5666 void *data;
5667 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005668 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005669 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005670 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005671 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005672 /* Offsets from p for storing byte pairs in the right order. */
5673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5674 int ihi = 1, ilo = 0;
5675#else
5676 int ihi = 0, ilo = 1;
5677#endif
5678
Benjamin Peterson29060642009-01-31 22:14:21 +00005679#define STORECHAR(CH) \
5680 do { \
5681 p[ihi] = ((CH) >> 8) & 0xff; \
5682 p[ilo] = (CH) & 0xff; \
5683 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005684 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005686 if (!PyUnicode_Check(str)) {
5687 PyErr_BadArgument();
5688 return NULL;
5689 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005690 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005691 return NULL;
5692 kind = PyUnicode_KIND(str);
5693 data = PyUnicode_DATA(str);
5694 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005695
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 pairs = 0;
5697 if (kind == PyUnicode_4BYTE_KIND)
5698 for (i = 0; i < len; i++)
5699 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5700 pairs++;
5701 /* 2 * (len + pairs + (byteorder == 0)) */
5702 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005704 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005705 bytesize = nsize * 2;
5706 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005708 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 if (v == NULL)
5710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005712 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005716 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005717
5718 if (byteorder == -1) {
5719 /* force LE */
5720 ihi = 1;
5721 ilo = 0;
5722 }
5723 else if (byteorder == 1) {
5724 /* force BE */
5725 ihi = 0;
5726 ilo = 1;
5727 }
5728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 for (i = 0; i < len; i++) {
5730 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5731 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005733 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5734 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 }
Tim Peters772747b2001-08-09 22:21:55 +00005736 STORECHAR(ch);
5737 if (ch2)
5738 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005739 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005740
5741 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005742 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005743#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744}
5745
Alexander Belopolsky40018472011-02-26 01:02:56 +00005746PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5748 Py_ssize_t size,
5749 const char *errors,
5750 int byteorder)
5751{
5752 PyObject *result;
5753 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5754 if (tmp == NULL)
5755 return NULL;
5756 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5757 Py_DECREF(tmp);
5758 return result;
5759}
5760
5761PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005762PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005764 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
5767/* --- Unicode Escape Codec ----------------------------------------------- */
5768
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s     input bytes (need not be NUL-terminated)
   size  number of bytes in s

   Returns -1 if size is negative, if the input contains a non-ASCII byte,
   if it ends in a lone backslash, or if any escapes were found which may
   cause the string to pop out of ASCII range (octal, \x, \u, \U, \N).
   Otherwise returns the length of the required buffer to hold the
   decoded string.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming the end pointer: with a negative size,
       p + size would already point outside the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5823
Fredrik Lundh06d12682001-01-24 07:59:11 +00005824static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005825
Alexander Belopolsky40018472011-02-26 01:02:56 +00005826PyObject *
5827PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005828 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005829 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832 Py_ssize_t startinpos;
5833 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005835 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005837 char* message;
5838 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 PyObject *errorHandler = NULL;
5840 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005843
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005845
5846 /* After length_of_escaped_ascii_string() there are two alternatives,
5847 either the string is pure ASCII with named escapes like \n, etc.
5848 and we determined it's exact size (common case)
5849 or it contains \x, \u, ... escape sequences. then we create a
5850 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 if (len >= 0) {
5852 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005853 if (!v)
5854 goto onError;
5855 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005856 }
5857 else {
5858 /* Escaped strings will always be longer than the resulting
5859 Unicode string, so we start with size here and then reduce the
5860 length after conversion to the true value.
5861 (but if the error callback returns a long replacement string
5862 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005863 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005864 if (!v)
5865 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005866 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005867 }
5868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005870 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005871 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005873
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 while (s < end) {
5875 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005876 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005879 /* The only case in which i == ascii_length is a backslash
5880 followed by a newline. */
5881 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005882
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 /* Non-escape characters are interpreted as Unicode ordinals */
5884 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005885 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 continue;
5888 }
5889
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 /* \ - Escapes */
5892 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005893 c = *s++;
5894 if (s > end)
5895 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005897 /* The only case in which i == ascii_length is a backslash
5898 followed by a newline. */
5899 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005901 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904#define WRITECHAR(ch) \
5905 do { \
5906 if (unicode_putchar(&v, &i, ch) < 0) \
5907 goto onError; \
5908 }while(0)
5909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005911 case '\\': WRITECHAR('\\'); break;
5912 case '\'': WRITECHAR('\''); break;
5913 case '\"': WRITECHAR('\"'); break;
5914 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005916 case 'f': WRITECHAR('\014'); break;
5917 case 't': WRITECHAR('\t'); break;
5918 case 'n': WRITECHAR('\n'); break;
5919 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005920 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005921 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005922 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 case '0': case '1': case '2': case '3':
5927 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005928 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005929 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005930 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005931 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005932 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005934 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 break;
5936
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* hex escapes */
5938 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005940 digits = 2;
5941 message = "truncated \\xXX escape";
5942 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005946 digits = 4;
5947 message = "truncated \\uXXXX escape";
5948 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005951 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005952 digits = 8;
5953 message = "truncated \\UXXXXXXXX escape";
5954 hexescape:
5955 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 if (s+digits>end) {
5957 endinpos = size;
5958 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 errors, &errorHandler,
5960 "unicodeescape", "end of string in escape sequence",
5961 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005962 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 goto onError;
5964 goto nextByte;
5965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005966 for (j = 0; j < digits; ++j) {
5967 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005968 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 errors, &errorHandler,
5972 "unicodeescape", message,
5973 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005974 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005975 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005976 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005978 }
5979 chr = (chr<<4) & ~0xF;
5980 if (c >= '0' && c <= '9')
5981 chr += c - '0';
5982 else if (c >= 'a' && c <= 'f')
5983 chr += 10 + c - 'a';
5984 else
5985 chr += 10 + c - 'A';
5986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005988 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 /* _decoding_error will have already written into the
5990 target buffer. */
5991 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005993 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005994 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005995 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005996 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 errors, &errorHandler,
6000 "unicodeescape", "illegal Unicode character",
6001 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006002 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006003 goto onError;
6004 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006005 break;
6006
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006008 case 'N':
6009 message = "malformed \\N character escape";
6010 if (ucnhash_CAPI == NULL) {
6011 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006012 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6013 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006014 if (ucnhash_CAPI == NULL)
6015 goto ucnhashError;
6016 }
6017 if (*s == '{') {
6018 const char *start = s+1;
6019 /* look for the closing brace */
6020 while (*s != '}' && s < end)
6021 s++;
6022 if (s > start && s < end && *s == '}') {
6023 /* found a name. look it up in the unicode database */
6024 message = "unknown Unicode character name";
6025 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006026 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006027 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006028 goto store;
6029 }
6030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 errors, &errorHandler,
6034 "unicodeescape", message,
6035 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006036 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006037 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006038 break;
6039
6040 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006041 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 message = "\\ at end of string";
6043 s--;
6044 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 errors, &errorHandler,
6047 "unicodeescape", message,
6048 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006049 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006050 goto onError;
6051 }
6052 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006053 WRITECHAR('\\');
6054 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006055 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006061#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006062
Victor Stinner16e6a802011-12-12 13:24:15 +01006063 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006064 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006065 Py_XDECREF(errorHandler);
6066 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006067 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006070 PyErr_SetString(
6071 PyExc_UnicodeError,
6072 "\\N escapes not supported (can't load unicodedata module)"
6073 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006074 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 Py_XDECREF(errorHandler);
6076 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006077 return NULL;
6078
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 Py_XDECREF(errorHandler);
6082 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 return NULL;
6084}
6085
6086/* Return a Unicode-Escape string version of the Unicode object.
6087
6088 If quotes is true, the string is enclosed in u"" or u'' quotes as
6089 appropriate.
6090
6091*/
6092
Alexander Belopolsky40018472011-02-26 01:02:56 +00006093PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006094PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006096 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006097 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006099 int kind;
6100 void *data;
6101 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102
Thomas Wouters89f507f2006-12-13 04:49:30 +00006103 /* Initial allocation is based on the longest-possible unichr
6104 escape.
6105
6106 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6107 unichr, so in this case it's the longest unichr escape. In
6108 narrow (UTF-16) builds this is five chars per source unichr
6109 since there are two unichrs in the surrogate pair, so in narrow
6110 (UTF-16) builds it's not the longest unichr escape.
6111
6112 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6113 so in the narrow (UTF-16) build case it's the longest unichr
6114 escape.
6115 */
6116
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006117 if (!PyUnicode_Check(unicode)) {
6118 PyErr_BadArgument();
6119 return NULL;
6120 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006121 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006122 return NULL;
6123 len = PyUnicode_GET_LENGTH(unicode);
6124 kind = PyUnicode_KIND(unicode);
6125 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006126 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6128 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6129 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6130 }
6131
6132 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006133 return PyBytes_FromStringAndSize(NULL, 0);
6134
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006135 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006137
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 if (repr == NULL)
6143 return NULL;
6144
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006145 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006148 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006149
Walter Dörwald79e913e2007-05-12 11:08:06 +00006150 /* Escape backslashes */
6151 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 *p++ = '\\';
6153 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006154 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006156
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006157 /* Map 21-bit characters to '\U00xxxxxx' */
6158 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006159 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006160 *p++ = '\\';
6161 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006162 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6163 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6164 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6165 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6166 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6167 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6168 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6169 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006171 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006172
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006174 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 *p++ = '\\';
6176 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006177 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6178 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6179 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6180 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006182
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006183 /* Map special whitespace to '\t', \n', '\r' */
6184 else if (ch == '\t') {
6185 *p++ = '\\';
6186 *p++ = 't';
6187 }
6188 else if (ch == '\n') {
6189 *p++ = '\\';
6190 *p++ = 'n';
6191 }
6192 else if (ch == '\r') {
6193 *p++ = '\\';
6194 *p++ = 'r';
6195 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006196
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006197 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006198 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006200 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006201 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6202 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006203 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006204
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 /* Copy everything else as-is */
6206 else
6207 *p++ = (char) ch;
6208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006210 assert(p - PyBytes_AS_STRING(repr) > 0);
6211 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6212 return NULL;
6213 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214}
6215
Alexander Belopolsky40018472011-02-26 01:02:56 +00006216PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6218 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220 PyObject *result;
6221 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6222 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 result = PyUnicode_AsUnicodeEscapeString(tmp);
6225 Py_DECREF(tmp);
6226 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227}
6228
6229/* --- Raw Unicode Escape Codec ------------------------------------------- */
6230
Alexander Belopolsky40018472011-02-26 01:02:56 +00006231PyObject *
6232PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006233 Py_ssize_t size,
6234 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006237 Py_ssize_t startinpos;
6238 Py_ssize_t endinpos;
6239 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006240 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 const char *end;
6242 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 PyObject *errorHandler = NULL;
6244 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006245
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 /* Escaped strings will always be longer than the resulting
6247 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 length after conversion to the true value. (But decoding error
6249 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006250 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006254 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006255 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 end = s + size;
6257 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 unsigned char c;
6259 Py_UCS4 x;
6260 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006261 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 /* Non-escape characters are interpreted as Unicode ordinals */
6264 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006265 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6266 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006268 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 startinpos = s-starts;
6270
6271 /* \u-escapes are only interpreted iff the number of leading
6272 backslashes if odd */
6273 bs = s;
6274 for (;s < end;) {
6275 if (*s != '\\')
6276 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006277 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6278 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 }
6280 if (((s - bs) & 1) == 0 ||
6281 s >= end ||
6282 (*s != 'u' && *s != 'U')) {
6283 continue;
6284 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006285 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 count = *s=='u' ? 4 : 8;
6287 s++;
6288
6289 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 for (x = 0, i = 0; i < count; ++i, ++s) {
6291 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006292 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 endinpos = s-starts;
6294 if (unicode_decode_call_errorhandler(
6295 errors, &errorHandler,
6296 "rawunicodeescape", "truncated \\uXXXX",
6297 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006298 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 goto onError;
6300 goto nextByte;
6301 }
6302 x = (x<<4) & ~0xF;
6303 if (c >= '0' && c <= '9')
6304 x += c - '0';
6305 else if (c >= 'a' && c <= 'f')
6306 x += 10 + c - 'a';
6307 else
6308 x += 10 + c - 'A';
6309 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006310 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006311 if (unicode_putchar(&v, &outpos, x) < 0)
6312 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006313 } else {
6314 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006315 if (unicode_decode_call_errorhandler(
6316 errors, &errorHandler,
6317 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006319 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006321 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 nextByte:
6323 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006325 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327 Py_XDECREF(errorHandler);
6328 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006329 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006330
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006333 Py_XDECREF(errorHandler);
6334 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 return NULL;
6336}
6337
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006342 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 char *p;
6344 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006345 Py_ssize_t expandsize, pos;
6346 int kind;
6347 void *data;
6348 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006350 if (!PyUnicode_Check(unicode)) {
6351 PyErr_BadArgument();
6352 return NULL;
6353 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006354 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006355 return NULL;
6356 kind = PyUnicode_KIND(unicode);
6357 data = PyUnicode_DATA(unicode);
6358 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006359 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6360 bytes, and 1 byte characters 4. */
6361 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006362
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006363 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006365
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006366 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 if (repr == NULL)
6368 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006369 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006370 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006372 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006373 for (pos = 0; pos < len; pos++) {
6374 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 /* Map 32-bit characters to '\Uxxxxxxxx' */
6376 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006377 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006378 *p++ = '\\';
6379 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006380 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6381 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6382 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6383 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6384 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6385 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6386 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6387 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006390 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 *p++ = '\\';
6392 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006393 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6394 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6395 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6396 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 /* Copy everything else as-is */
6399 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 *p++ = (char) ch;
6401 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006402
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 assert(p > q);
6404 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006405 return NULL;
6406 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407}
6408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6411 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006413 PyObject *result;
6414 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6415 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006416 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006417 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6418 Py_DECREF(tmp);
6419 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420}
6421
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006422/* --- Unicode Internal Codec ------------------------------------------- */
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
6425_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006426 Py_ssize_t size,
6427 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006428{
6429 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006430 Py_ssize_t startinpos;
6431 Py_ssize_t endinpos;
6432 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006433 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006434 const char *end;
6435 const char *reason;
6436 PyObject *errorHandler = NULL;
6437 PyObject *exc = NULL;
6438
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006439 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006440 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006441 1))
6442 return NULL;
6443
Thomas Wouters89f507f2006-12-13 04:49:30 +00006444 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006445 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006446 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006448 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006449 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006450 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006451 end = s + size;
6452
6453 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006454 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006455 Py_UCS4 ch;
6456 /* We copy the raw representation one byte at a time because the
6457 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006458 ((char *) &uch)[0] = s[0];
6459 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006460#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006461 ((char *) &uch)[2] = s[2];
6462 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006463#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006464 ch = uch;
6465
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006466 /* We have to sanity check the raw data, otherwise doom looms for
6467 some malformed UCS-4 data. */
6468 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006469#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006470 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006471#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006472 end-s < Py_UNICODE_SIZE
6473 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006475 startinpos = s - starts;
6476 if (end-s < Py_UNICODE_SIZE) {
6477 endinpos = end-starts;
6478 reason = "truncated input";
6479 }
6480 else {
6481 endinpos = s - starts + Py_UNICODE_SIZE;
6482 reason = "illegal code point (> 0x10FFFF)";
6483 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006484 if (unicode_decode_call_errorhandler(
6485 errors, &errorHandler,
6486 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006487 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006488 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006489 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006490 continue;
6491 }
6492
6493 s += Py_UNICODE_SIZE;
6494#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006495 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006496 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006497 Py_UNICODE uch2;
6498 ((char *) &uch2)[0] = s[0];
6499 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006500 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006501 {
Victor Stinner551ac952011-11-29 22:58:13 +01006502 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006504 }
6505 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006506#endif
6507
6508 if (unicode_putchar(&v, &outpos, ch) < 0)
6509 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006510 }
6511
Victor Stinner16e6a802011-12-12 13:24:15 +01006512 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 goto onError;
6514 Py_XDECREF(errorHandler);
6515 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006516 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006519 Py_XDECREF(v);
6520 Py_XDECREF(errorHandler);
6521 Py_XDECREF(exc);
6522 return NULL;
6523}
6524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525/* --- Latin-1 Codec ------------------------------------------------------ */
6526
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527PyObject *
6528PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006529 Py_ssize_t size,
6530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006533 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534}
6535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006537static void
6538make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006539 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006540 PyObject *unicode,
6541 Py_ssize_t startpos, Py_ssize_t endpos,
6542 const char *reason)
6543{
6544 if (*exceptionObject == NULL) {
6545 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006547 encoding, unicode, startpos, endpos, reason);
6548 }
6549 else {
6550 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6551 goto onError;
6552 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6553 goto onError;
6554 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6555 goto onError;
6556 return;
6557 onError:
6558 Py_DECREF(*exceptionObject);
6559 *exceptionObject = NULL;
6560 }
6561}
6562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006564static void
6565raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006566 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006567 PyObject *unicode,
6568 Py_ssize_t startpos, Py_ssize_t endpos,
6569 const char *reason)
6570{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006571 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006572 encoding, unicode, startpos, endpos, reason);
6573 if (*exceptionObject != NULL)
6574 PyCodec_StrictErrors(*exceptionObject);
6575}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576
6577/* error handling callback helper:
6578 build arguments, call the callback and check the arguments,
6579 put the result into newpos and return the replacement string, which
6580 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581static PyObject *
6582unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006583 PyObject **errorHandler,
6584 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006585 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006586 Py_ssize_t startpos, Py_ssize_t endpos,
6587 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006589 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006590 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591 PyObject *restuple;
6592 PyObject *resunicode;
6593
6594 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598 }
6599
Benjamin Petersonbac79492012-01-14 13:34:47 -05006600 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 return NULL;
6602 len = PyUnicode_GET_LENGTH(unicode);
6603
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006604 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608
6609 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006614 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 Py_DECREF(restuple);
6616 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006617 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006618 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 &resunicode, newpos)) {
6620 Py_DECREF(restuple);
6621 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006623 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6624 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6625 Py_DECREF(restuple);
6626 return NULL;
6627 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006629 *newpos = len + *newpos;
6630 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6632 Py_DECREF(restuple);
6633 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 Py_INCREF(resunicode);
6636 Py_DECREF(restuple);
6637 return resunicode;
6638}
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006643 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 /* input state */
6646 Py_ssize_t pos=0, size;
6647 int kind;
6648 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 /* output object */
6650 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 /* pointer into the output */
6652 char *str;
6653 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006654 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006655 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6656 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 PyObject *errorHandler = NULL;
6658 PyObject *exc = NULL;
6659 /* the following variable is used for caching string comparisons
6660 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6661 int known_errorHandler = -1;
6662
Benjamin Petersonbac79492012-01-14 13:34:47 -05006663 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 return NULL;
6665 size = PyUnicode_GET_LENGTH(unicode);
6666 kind = PyUnicode_KIND(unicode);
6667 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 /* allocate enough for a simple encoding without
6669 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006670 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006671 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006672 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006674 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006675 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 ressize = size;
6677
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 while (pos < size) {
6679 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 /* can we encode this? */
6682 if (c<limit) {
6683 /* no overflow check, because we know that the space is enough */
6684 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006686 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 Py_ssize_t requiredsize;
6689 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006690 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006692 Py_ssize_t collstart = pos;
6693 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006695 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 ++collend;
6697 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6698 if (known_errorHandler==-1) {
6699 if ((errors==NULL) || (!strcmp(errors, "strict")))
6700 known_errorHandler = 1;
6701 else if (!strcmp(errors, "replace"))
6702 known_errorHandler = 2;
6703 else if (!strcmp(errors, "ignore"))
6704 known_errorHandler = 3;
6705 else if (!strcmp(errors, "xmlcharrefreplace"))
6706 known_errorHandler = 4;
6707 else
6708 known_errorHandler = 0;
6709 }
6710 switch (known_errorHandler) {
6711 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006712 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 goto onError;
6714 case 2: /* replace */
6715 while (collstart++<collend)
6716 *str++ = '?'; /* fall through */
6717 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 break;
6720 case 4: /* xmlcharrefreplace */
6721 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 /* determine replacement size */
6723 for (i = collstart, repsize = 0; i < collend; ++i) {
6724 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6725 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006727 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006737 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006738 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 if (requiredsize > ressize) {
6744 if (requiredsize<2*ressize)
6745 requiredsize = 2*ressize;
6746 if (_PyBytes_Resize(&res, requiredsize))
6747 goto onError;
6748 str = PyBytes_AS_STRING(res) + respos;
6749 ressize = requiredsize;
6750 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 /* generate replacement */
6752 for (i = collstart; i < collend; ++i) {
6753 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 break;
6757 default:
6758 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 encoding, reason, unicode, &exc,
6760 collstart, collend, &newpos);
6761 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006762 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006764 if (PyBytes_Check(repunicode)) {
6765 /* Directly copy bytes result to output. */
6766 repsize = PyBytes_Size(repunicode);
6767 if (repsize > 1) {
6768 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006769 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006770 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6771 Py_DECREF(repunicode);
6772 goto onError;
6773 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006774 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006775 ressize += repsize-1;
6776 }
6777 memcpy(str, PyBytes_AsString(repunicode), repsize);
6778 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006780 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006781 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 /* need more space? (at least enough for what we
6784 have+the replacement+the rest of the string, so
6785 we won't have to check space for encodable characters) */
6786 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 repsize = PyUnicode_GET_LENGTH(repunicode);
6788 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 if (requiredsize > ressize) {
6790 if (requiredsize<2*ressize)
6791 requiredsize = 2*ressize;
6792 if (_PyBytes_Resize(&res, requiredsize)) {
6793 Py_DECREF(repunicode);
6794 goto onError;
6795 }
6796 str = PyBytes_AS_STRING(res) + respos;
6797 ressize = requiredsize;
6798 }
6799 /* check if there is anything unencodable in the replacement
6800 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 for (i = 0; repsize-->0; ++i, ++str) {
6802 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006804 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 Py_DECREF(repunicode);
6807 goto onError;
6808 }
6809 *str = (char)c;
6810 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006812 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006814 }
6815 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006816 /* Resize if we allocated to much */
6817 size = str - PyBytes_AS_STRING(res);
6818 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006819 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006820 if (_PyBytes_Resize(&res, size) < 0)
6821 goto onError;
6822 }
6823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824 Py_XDECREF(errorHandler);
6825 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006826 return res;
6827
6828 onError:
6829 Py_XDECREF(res);
6830 Py_XDECREF(errorHandler);
6831 Py_XDECREF(exc);
6832 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833}
6834
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006836PyObject *
6837PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006838 Py_ssize_t size,
6839 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 PyObject *result;
6842 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6843 if (unicode == NULL)
6844 return NULL;
6845 result = unicode_encode_ucs1(unicode, errors, 256);
6846 Py_DECREF(unicode);
6847 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
Alexander Belopolsky40018472011-02-26 01:02:56 +00006850PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006851_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852{
6853 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 PyErr_BadArgument();
6855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006857 if (PyUnicode_READY(unicode) == -1)
6858 return NULL;
6859 /* Fast path: if it is a one-byte string, construct
6860 bytes object directly. */
6861 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6862 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6863 PyUnicode_GET_LENGTH(unicode));
6864 /* Non-Latin-1 characters present. Defer to above function to
6865 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006866 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006867}
6868
6869PyObject*
6870PyUnicode_AsLatin1String(PyObject *unicode)
6871{
6872 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873}
6874
6875/* --- 7-bit ASCII Codec -------------------------------------------------- */
6876
Alexander Belopolsky40018472011-02-26 01:02:56 +00006877PyObject *
6878PyUnicode_DecodeASCII(const char *s,
6879 Py_ssize_t size,
6880 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006882 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006883 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006884 int kind;
6885 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006886 Py_ssize_t startinpos;
6887 Py_ssize_t endinpos;
6888 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006889 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006890 int has_error;
6891 const unsigned char *p = (const unsigned char *)s;
6892 const unsigned char *end = p + size;
6893 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 PyObject *errorHandler = NULL;
6895 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006896
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006897 if (size == 0) {
6898 Py_INCREF(unicode_empty);
6899 return unicode_empty;
6900 }
6901
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006903 if (size == 1 && (unsigned char)s[0] < 128)
6904 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905
Victor Stinner702c7342011-10-05 13:50:52 +02006906 has_error = 0;
6907 while (p < end && !has_error) {
6908 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6909 an explanation. */
6910 if (!((size_t) p & LONG_PTR_MASK)) {
6911 /* Help register allocation */
6912 register const unsigned char *_p = p;
6913 while (_p < aligned_end) {
6914 unsigned long value = *(unsigned long *) _p;
6915 if (value & ASCII_CHAR_MASK) {
6916 has_error = 1;
6917 break;
6918 }
6919 _p += SIZEOF_LONG;
6920 }
6921 if (_p == end)
6922 break;
6923 if (has_error)
6924 break;
6925 p = _p;
6926 }
6927 if (*p & 0x80) {
6928 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006929 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006930 }
6931 else {
6932 ++p;
6933 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006934 }
Victor Stinner702c7342011-10-05 13:50:52 +02006935 if (!has_error)
6936 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006937
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006938 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006942 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006943 kind = PyUnicode_KIND(v);
6944 data = PyUnicode_DATA(v);
6945 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 e = s + size;
6947 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 register unsigned char c = (unsigned char)*s;
6949 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006950 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 ++s;
6952 }
6953 else {
6954 startinpos = s-starts;
6955 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 if (unicode_decode_call_errorhandler(
6957 errors, &errorHandler,
6958 "ascii", "ordinal not in range(128)",
6959 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006960 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006962 kind = PyUnicode_KIND(v);
6963 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006966 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 Py_XDECREF(errorHandler);
6969 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006970 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006971 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006972
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006975 Py_XDECREF(errorHandler);
6976 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 return NULL;
6978}
6979
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006980/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006981PyObject *
6982PyUnicode_EncodeASCII(const Py_UNICODE *p,
6983 Py_ssize_t size,
6984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006986 PyObject *result;
6987 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6988 if (unicode == NULL)
6989 return NULL;
6990 result = unicode_encode_ucs1(unicode, errors, 128);
6991 Py_DECREF(unicode);
6992 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993}
6994
Alexander Belopolsky40018472011-02-26 01:02:56 +00006995PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006996_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
6998 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 PyErr_BadArgument();
7000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007002 if (PyUnicode_READY(unicode) == -1)
7003 return NULL;
7004 /* Fast path: if it is an ASCII-only string, construct bytes object
7005 directly. Else defer to above function to raise the exception. */
7006 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7007 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7008 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007009 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007010}
7011
7012PyObject *
7013PyUnicode_AsASCIIString(PyObject *unicode)
7014{
7015 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Victor Stinner99b95382011-07-04 14:23:54 +02007018#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007019
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007020/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007021
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007022#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023#define NEED_RETRY
7024#endif
7025
Victor Stinner3a50e702011-10-18 21:21:00 +02007026#ifndef WC_ERR_INVALID_CHARS
7027# define WC_ERR_INVALID_CHARS 0x0080
7028#endif
7029
7030static char*
7031code_page_name(UINT code_page, PyObject **obj)
7032{
7033 *obj = NULL;
7034 if (code_page == CP_ACP)
7035 return "mbcs";
7036 if (code_page == CP_UTF7)
7037 return "CP_UTF7";
7038 if (code_page == CP_UTF8)
7039 return "CP_UTF8";
7040
7041 *obj = PyBytes_FromFormat("cp%u", code_page);
7042 if (*obj == NULL)
7043 return NULL;
7044 return PyBytes_AS_STRING(*obj);
7045}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046
Alexander Belopolsky40018472011-02-26 01:02:56 +00007047static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007048is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049{
7050 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 if (!IsDBCSLeadByteEx(code_page, *curr))
7054 return 0;
7055
7056 prev = CharPrevExA(code_page, s, curr, 0);
7057 if (prev == curr)
7058 return 1;
7059 /* FIXME: This code is limited to "true" double-byte encodings,
7060 as it assumes an incomplete character consists of a single
7061 byte. */
7062 if (curr - prev == 2)
7063 return 1;
7064 if (!IsDBCSLeadByteEx(code_page, *prev))
7065 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066 return 0;
7067}
7068
Victor Stinner3a50e702011-10-18 21:21:00 +02007069static DWORD
7070decode_code_page_flags(UINT code_page)
7071{
7072 if (code_page == CP_UTF7) {
7073 /* The CP_UTF7 decoder only supports flags=0 */
7074 return 0;
7075 }
7076 else
7077 return MB_ERR_INVALID_CHARS;
7078}
7079
/*
 * Decode a byte string from a Windows code page into a unicode object in
 * strict mode.
 *
 * Returns the consumed size on success, returns -2 on a decode error, or
 * raises a WindowsError and returns -1 on any other error.
 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007087static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007088decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007089 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 const char *in,
7091 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092{
Victor Stinner3a50e702011-10-18 21:21:00 +02007093 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007094 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096
7097 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 assert(insize > 0);
7099 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7100 if (outsize <= 0)
7101 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102
7103 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007105 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007106 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 if (*v == NULL)
7108 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 }
7111 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007114 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 }
7118
7119 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7121 if (outsize <= 0)
7122 goto error;
7123 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007124
Victor Stinner3a50e702011-10-18 21:21:00 +02007125error:
7126 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7127 return -2;
7128 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007129 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130}
7131
Victor Stinner3a50e702011-10-18 21:21:00 +02007132/*
7133 * Decode a byte string from a code page into unicode object with an error
7134 * handler.
7135 *
7136 * Returns consumed size if succeed, or raise a WindowsError or
7137 * UnicodeDecodeError exception and returns -1 on error.
7138 */
7139static int
7140decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007141 PyObject **v,
7142 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 const char *errors)
7144{
7145 const char *startin = in;
7146 const char *endin = in + size;
7147 const DWORD flags = decode_code_page_flags(code_page);
7148 /* Ideally, we should get reason from FormatMessage. This is the Windows
7149 2000 English version of the message. */
7150 const char *reason = "No mapping for the Unicode character exists "
7151 "in the target code page.";
7152 /* each step cannot decode more than 1 character, but a character can be
7153 represented as a surrogate pair */
7154 wchar_t buffer[2], *startout, *out;
7155 int insize, outsize;
7156 PyObject *errorHandler = NULL;
7157 PyObject *exc = NULL;
7158 PyObject *encoding_obj = NULL;
7159 char *encoding;
7160 DWORD err;
7161 int ret = -1;
7162
7163 assert(size > 0);
7164
7165 encoding = code_page_name(code_page, &encoding_obj);
7166 if (encoding == NULL)
7167 return -1;
7168
7169 if (errors == NULL || strcmp(errors, "strict") == 0) {
7170 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7171 UnicodeDecodeError. */
7172 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7173 if (exc != NULL) {
7174 PyCodec_StrictErrors(exc);
7175 Py_CLEAR(exc);
7176 }
7177 goto error;
7178 }
7179
7180 if (*v == NULL) {
7181 /* Create unicode object */
7182 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7183 PyErr_NoMemory();
7184 goto error;
7185 }
Victor Stinnerab595942011-12-17 04:59:06 +01007186 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007187 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 if (*v == NULL)
7189 goto error;
7190 startout = PyUnicode_AS_UNICODE(*v);
7191 }
7192 else {
7193 /* Extend unicode object */
7194 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7195 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7196 PyErr_NoMemory();
7197 goto error;
7198 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007199 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 goto error;
7201 startout = PyUnicode_AS_UNICODE(*v) + n;
7202 }
7203
7204 /* Decode the byte string character per character */
7205 out = startout;
7206 while (in < endin)
7207 {
7208 /* Decode a character */
7209 insize = 1;
7210 do
7211 {
7212 outsize = MultiByteToWideChar(code_page, flags,
7213 in, insize,
7214 buffer, Py_ARRAY_LENGTH(buffer));
7215 if (outsize > 0)
7216 break;
7217 err = GetLastError();
7218 if (err != ERROR_NO_UNICODE_TRANSLATION
7219 && err != ERROR_INSUFFICIENT_BUFFER)
7220 {
7221 PyErr_SetFromWindowsErr(0);
7222 goto error;
7223 }
7224 insize++;
7225 }
7226 /* 4=maximum length of a UTF-8 sequence */
7227 while (insize <= 4 && (in + insize) <= endin);
7228
7229 if (outsize <= 0) {
7230 Py_ssize_t startinpos, endinpos, outpos;
7231
7232 startinpos = in - startin;
7233 endinpos = startinpos + 1;
7234 outpos = out - PyUnicode_AS_UNICODE(*v);
7235 if (unicode_decode_call_errorhandler(
7236 errors, &errorHandler,
7237 encoding, reason,
7238 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007239 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 {
7241 goto error;
7242 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007243 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 }
7245 else {
7246 in += insize;
7247 memcpy(out, buffer, outsize * sizeof(wchar_t));
7248 out += outsize;
7249 }
7250 }
7251
7252 /* write a NUL character at the end */
7253 *out = 0;
7254
7255 /* Extend unicode object */
7256 outsize = out - startout;
7257 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007258 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007260 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007261
7262error:
7263 Py_XDECREF(encoding_obj);
7264 Py_XDECREF(errorHandler);
7265 Py_XDECREF(exc);
7266 return ret;
7267}
7268
Victor Stinner3a50e702011-10-18 21:21:00 +02007269static PyObject *
7270decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007271 const char *s, Py_ssize_t size,
7272 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007273{
Victor Stinner76a31a62011-11-04 00:05:13 +01007274 PyObject *v = NULL;
7275 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007276
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 if (code_page < 0) {
7278 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7279 return NULL;
7280 }
7281
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 do
7286 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 if (size > INT_MAX) {
7289 chunk_size = INT_MAX;
7290 final = 0;
7291 done = 0;
7292 }
7293 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007294#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 {
7296 chunk_size = (int)size;
7297 final = (consumed == NULL);
7298 done = 1;
7299 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007300
Victor Stinner76a31a62011-11-04 00:05:13 +01007301 /* Skip trailing lead-byte unless 'final' is set */
7302 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7303 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Victor Stinner76a31a62011-11-04 00:05:13 +01007305 if (chunk_size == 0 && done) {
7306 if (v != NULL)
7307 break;
7308 Py_INCREF(unicode_empty);
7309 return unicode_empty;
7310 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311
Victor Stinner76a31a62011-11-04 00:05:13 +01007312
7313 converted = decode_code_page_strict(code_page, &v,
7314 s, chunk_size);
7315 if (converted == -2)
7316 converted = decode_code_page_errors(code_page, &v,
7317 s, chunk_size,
7318 errors);
7319 assert(converted != 0);
7320
7321 if (converted < 0) {
7322 Py_XDECREF(v);
7323 return NULL;
7324 }
7325
7326 if (consumed)
7327 *consumed += converted;
7328
7329 s += converted;
7330 size -= converted;
7331 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007332
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007333 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007334}
7335
Alexander Belopolsky40018472011-02-26 01:02:56 +00007336PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007337PyUnicode_DecodeCodePageStateful(int code_page,
7338 const char *s,
7339 Py_ssize_t size,
7340 const char *errors,
7341 Py_ssize_t *consumed)
7342{
7343 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7344}
7345
7346PyObject *
7347PyUnicode_DecodeMBCSStateful(const char *s,
7348 Py_ssize_t size,
7349 const char *errors,
7350 Py_ssize_t *consumed)
7351{
7352 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7353}
7354
7355PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007356PyUnicode_DecodeMBCS(const char *s,
7357 Py_ssize_t size,
7358 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007359{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7361}
7362
Victor Stinner3a50e702011-10-18 21:21:00 +02007363static DWORD
7364encode_code_page_flags(UINT code_page, const char *errors)
7365{
7366 if (code_page == CP_UTF8) {
7367 if (winver.dwMajorVersion >= 6)
7368 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7369 and later */
7370 return WC_ERR_INVALID_CHARS;
7371 else
7372 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7373 return 0;
7374 }
7375 else if (code_page == CP_UTF7) {
7376 /* CP_UTF7 only supports flags=0 */
7377 return 0;
7378 }
7379 else {
7380 if (errors != NULL && strcmp(errors, "replace") == 0)
7381 return 0;
7382 else
7383 return WC_NO_BEST_FIT_CHARS;
7384 }
7385}
7386
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 * Encode a Unicode string to a Windows code page into a byte string in strict
7389 * mode.
7390 *
7391 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7392 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007393 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007394static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007395encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398{
Victor Stinner554f3f02010-06-16 23:33:54 +00007399 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 BOOL *pusedDefaultChar = &usedDefaultChar;
7401 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007402 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007403 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007404 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 const DWORD flags = encode_code_page_flags(code_page, NULL);
7406 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007407 /* Create a substring so that we can get the UTF-16 representation
7408 of just the slice under consideration. */
7409 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007410
Martin v. Löwis3d325192011-11-04 18:23:06 +01007411 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007412
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007414 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007416 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007417
Victor Stinner2fc507f2011-11-04 20:06:39 +01007418 substring = PyUnicode_Substring(unicode, offset, offset+len);
7419 if (substring == NULL)
7420 return -1;
7421 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7422 if (p == NULL) {
7423 Py_DECREF(substring);
7424 return -1;
7425 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007426
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007427 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 outsize = WideCharToMultiByte(code_page, flags,
7429 p, size,
7430 NULL, 0,
7431 NULL, pusedDefaultChar);
7432 if (outsize <= 0)
7433 goto error;
7434 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 if (pusedDefaultChar && *pusedDefaultChar) {
7436 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007439
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007443 if (*outbytes == NULL) {
7444 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448 }
7449 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 const Py_ssize_t n = PyBytes_Size(*outbytes);
7452 if (outsize > PY_SSIZE_T_MAX - n) {
7453 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7458 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462 }
7463
7464 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 outsize = WideCharToMultiByte(code_page, flags,
7466 p, size,
7467 out, outsize,
7468 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 if (outsize <= 0)
7471 goto error;
7472 if (pusedDefaultChar && *pusedDefaultChar)
7473 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007475
Victor Stinner3a50e702011-10-18 21:21:00 +02007476error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7479 return -2;
7480 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007481 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482}
7483
Victor Stinner3a50e702011-10-18 21:21:00 +02007484/*
7485 * Encode a Unicode string to a Windows code page into a byte string using a
7486 * error handler.
7487 *
7488 * Returns consumed characters if succeed, or raise a WindowsError and returns
7489 * -1 on other error.
7490 */
7491static int
7492encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007493 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007494 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007495{
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 Py_ssize_t pos = unicode_offset;
7498 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 /* Ideally, we should get reason from FormatMessage. This is the Windows
7500 2000 English version of the message. */
7501 const char *reason = "invalid character";
7502 /* 4=maximum length of a UTF-8 sequence */
7503 char buffer[4];
7504 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7505 Py_ssize_t outsize;
7506 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 PyObject *errorHandler = NULL;
7508 PyObject *exc = NULL;
7509 PyObject *encoding_obj = NULL;
7510 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007511 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 PyObject *rep;
7513 int ret = -1;
7514
7515 assert(insize > 0);
7516
7517 encoding = code_page_name(code_page, &encoding_obj);
7518 if (encoding == NULL)
7519 return -1;
7520
7521 if (errors == NULL || strcmp(errors, "strict") == 0) {
7522 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7523 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007524 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 if (exc != NULL) {
7526 PyCodec_StrictErrors(exc);
7527 Py_DECREF(exc);
7528 }
7529 Py_XDECREF(encoding_obj);
7530 return -1;
7531 }
7532
7533 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7534 pusedDefaultChar = &usedDefaultChar;
7535 else
7536 pusedDefaultChar = NULL;
7537
7538 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7539 PyErr_NoMemory();
7540 goto error;
7541 }
7542 outsize = insize * Py_ARRAY_LENGTH(buffer);
7543
7544 if (*outbytes == NULL) {
7545 /* Create string object */
7546 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7547 if (*outbytes == NULL)
7548 goto error;
7549 out = PyBytes_AS_STRING(*outbytes);
7550 }
7551 else {
7552 /* Extend string object */
7553 Py_ssize_t n = PyBytes_Size(*outbytes);
7554 if (n > PY_SSIZE_T_MAX - outsize) {
7555 PyErr_NoMemory();
7556 goto error;
7557 }
7558 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7559 goto error;
7560 out = PyBytes_AS_STRING(*outbytes) + n;
7561 }
7562
7563 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007564 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007566 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7567 wchar_t chars[2];
7568 int charsize;
7569 if (ch < 0x10000) {
7570 chars[0] = (wchar_t)ch;
7571 charsize = 1;
7572 }
7573 else {
7574 ch -= 0x10000;
7575 chars[0] = 0xd800 + (ch >> 10);
7576 chars[1] = 0xdc00 + (ch & 0x3ff);
7577 charsize = 2;
7578 }
7579
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007581 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 buffer, Py_ARRAY_LENGTH(buffer),
7583 NULL, pusedDefaultChar);
7584 if (outsize > 0) {
7585 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7586 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007587 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 memcpy(out, buffer, outsize);
7589 out += outsize;
7590 continue;
7591 }
7592 }
7593 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7594 PyErr_SetFromWindowsErr(0);
7595 goto error;
7596 }
7597
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 rep = unicode_encode_call_errorhandler(
7599 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007600 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007601 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 if (rep == NULL)
7603 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007604 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007605
7606 if (PyBytes_Check(rep)) {
7607 outsize = PyBytes_GET_SIZE(rep);
7608 if (outsize != 1) {
7609 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7610 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7611 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7612 Py_DECREF(rep);
7613 goto error;
7614 }
7615 out = PyBytes_AS_STRING(*outbytes) + offset;
7616 }
7617 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7618 out += outsize;
7619 }
7620 else {
7621 Py_ssize_t i;
7622 enum PyUnicode_Kind kind;
7623 void *data;
7624
Benjamin Petersonbac79492012-01-14 13:34:47 -05007625 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 Py_DECREF(rep);
7627 goto error;
7628 }
7629
7630 outsize = PyUnicode_GET_LENGTH(rep);
7631 if (outsize != 1) {
7632 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7633 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7634 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7635 Py_DECREF(rep);
7636 goto error;
7637 }
7638 out = PyBytes_AS_STRING(*outbytes) + offset;
7639 }
7640 kind = PyUnicode_KIND(rep);
7641 data = PyUnicode_DATA(rep);
7642 for (i=0; i < outsize; i++) {
7643 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7644 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007645 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 encoding, unicode,
7647 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 "unable to encode error handler result to ASCII");
7649 Py_DECREF(rep);
7650 goto error;
7651 }
7652 *out = (unsigned char)ch;
7653 out++;
7654 }
7655 }
7656 Py_DECREF(rep);
7657 }
7658 /* write a NUL byte */
7659 *out = 0;
7660 outsize = out - PyBytes_AS_STRING(*outbytes);
7661 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7662 if (_PyBytes_Resize(outbytes, outsize) < 0)
7663 goto error;
7664 ret = 0;
7665
7666error:
7667 Py_XDECREF(encoding_obj);
7668 Py_XDECREF(errorHandler);
7669 Py_XDECREF(exc);
7670 return ret;
7671}
7672
Victor Stinner3a50e702011-10-18 21:21:00 +02007673static PyObject *
7674encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007675 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 const char *errors)
7677{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007678 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007679 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007680 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007681 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007682
Benjamin Petersonbac79492012-01-14 13:34:47 -05007683 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007684 return NULL;
7685 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007686
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 if (code_page < 0) {
7688 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7689 return NULL;
7690 }
7691
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007693 return PyBytes_FromStringAndSize(NULL, 0);
7694
Victor Stinner7581cef2011-11-03 22:32:33 +01007695 offset = 0;
7696 do
7697 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007699 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007700 chunks. */
7701 if (len > INT_MAX/2) {
7702 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007703 done = 0;
7704 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007705 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007706#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007707 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007708 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 done = 1;
7710 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007711
Victor Stinner76a31a62011-11-04 00:05:13 +01007712 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007713 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 errors);
7715 if (ret == -2)
7716 ret = encode_code_page_errors(code_page, &outbytes,
7717 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007718 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 if (ret < 0) {
7720 Py_XDECREF(outbytes);
7721 return NULL;
7722 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007723
Victor Stinner7581cef2011-11-03 22:32:33 +01007724 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007727
Victor Stinner3a50e702011-10-18 21:21:00 +02007728 return outbytes;
7729}
7730
7731PyObject *
7732PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7733 Py_ssize_t size,
7734 const char *errors)
7735{
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 PyObject *unicode, *res;
7737 unicode = PyUnicode_FromUnicode(p, size);
7738 if (unicode == NULL)
7739 return NULL;
7740 res = encode_code_page(CP_ACP, unicode, errors);
7741 Py_DECREF(unicode);
7742 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007743}
7744
7745PyObject *
7746PyUnicode_EncodeCodePage(int code_page,
7747 PyObject *unicode,
7748 const char *errors)
7749{
Victor Stinner7581cef2011-11-03 22:32:33 +01007750 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007751}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007752
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753PyObject *
7754PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007755{
7756 if (!PyUnicode_Check(unicode)) {
7757 PyErr_BadArgument();
7758 return NULL;
7759 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007760 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007761}
7762
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007763#undef NEED_RETRY
7764
Victor Stinner99b95382011-07-04 14:23:54 +02007765#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007766
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767/* --- Character Mapping Codec -------------------------------------------- */
7768
Alexander Belopolsky40018472011-02-26 01:02:56 +00007769PyObject *
7770PyUnicode_DecodeCharmap(const char *s,
7771 Py_ssize_t size,
7772 PyObject *mapping,
7773 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007776 Py_ssize_t startinpos;
7777 Py_ssize_t endinpos;
7778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007779 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007780 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007781 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007782 PyObject *errorHandler = NULL;
7783 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007784
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 /* Default to Latin-1 */
7786 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007789 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007793 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007794 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007795 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007796 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007797 Py_ssize_t maplen;
7798 enum PyUnicode_Kind kind;
7799 void *data;
7800 Py_UCS4 x;
7801
Benjamin Petersonbac79492012-01-14 13:34:47 -05007802 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007803 return NULL;
7804
7805 maplen = PyUnicode_GET_LENGTH(mapping);
7806 data = PyUnicode_DATA(mapping);
7807 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 while (s < e) {
7809 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007812 x = PyUnicode_READ(kind, data, ch);
7813 else
7814 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007816 if (x == 0xfffe)
7817 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 startinpos = s-starts;
7820 endinpos = startinpos+1;
7821 if (unicode_decode_call_errorhandler(
7822 errors, &errorHandler,
7823 "charmap", "character maps to <undefined>",
7824 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007825 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 goto onError;
7827 }
7828 continue;
7829 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007830
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007831 if (unicode_putchar(&v, &outpos, x) < 0)
7832 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007835 }
7836 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 while (s < e) {
7838 unsigned char ch = *s;
7839 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007840
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7842 w = PyLong_FromLong((long)ch);
7843 if (w == NULL)
7844 goto onError;
7845 x = PyObject_GetItem(mapping, w);
7846 Py_DECREF(w);
7847 if (x == NULL) {
7848 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7849 /* No mapping found means: mapping is undefined. */
7850 PyErr_Clear();
7851 x = Py_None;
7852 Py_INCREF(x);
7853 } else
7854 goto onError;
7855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 /* Apply mapping */
7858 if (PyLong_Check(x)) {
7859 long value = PyLong_AS_LONG(x);
7860 if (value < 0 || value > 65535) {
7861 PyErr_SetString(PyExc_TypeError,
7862 "character mapping must be in range(65536)");
7863 Py_DECREF(x);
7864 goto onError;
7865 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007866 if (unicode_putchar(&v, &outpos, value) < 0)
7867 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 }
7869 else if (x == Py_None) {
7870 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 startinpos = s-starts;
7872 endinpos = startinpos+1;
7873 if (unicode_decode_call_errorhandler(
7874 errors, &errorHandler,
7875 "charmap", "character maps to <undefined>",
7876 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007877 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 Py_DECREF(x);
7879 goto onError;
7880 }
7881 Py_DECREF(x);
7882 continue;
7883 }
7884 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007885 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007886
Benjamin Petersonbac79492012-01-14 13:34:47 -05007887 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007888 goto onError;
7889 targetsize = PyUnicode_GET_LENGTH(x);
7890
7891 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007893 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007894 PyUnicode_READ_CHAR(x, 0)) < 0)
7895 goto onError;
7896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 else if (targetsize > 1) {
7898 /* 1-n mapping */
7899 if (targetsize > extrachars) {
7900 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 Py_ssize_t needed = (targetsize - extrachars) + \
7902 (targetsize << 2);
7903 extrachars += needed;
7904 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007905 if (unicode_resize(&v,
7906 PyUnicode_GET_LENGTH(v) + needed) < 0)
7907 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 Py_DECREF(x);
7909 goto onError;
7910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007912 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7913 goto onError;
7914 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7915 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 extrachars -= targetsize;
7917 }
7918 /* 1-0 mapping: skip the character */
7919 }
7920 else {
7921 /* wrong return value */
7922 PyErr_SetString(PyExc_TypeError,
7923 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007924 Py_DECREF(x);
7925 goto onError;
7926 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 Py_DECREF(x);
7928 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007931 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007932 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 Py_XDECREF(errorHandler);
7934 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007935 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007936
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938 Py_XDECREF(errorHandler);
7939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 Py_XDECREF(v);
7941 return NULL;
7942}
7943
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007944/* Charmap encoding: the lookup table */
7945
Alexander Belopolsky40018472011-02-26 01:02:56 +00007946struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 PyObject_HEAD
7948 unsigned char level1[32];
7949 int count2, count3;
7950 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951};
7952
7953static PyObject*
7954encoding_map_size(PyObject *obj, PyObject* args)
7955{
7956 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007957 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959}
7960
7961static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007962 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 PyDoc_STR("Return the size (in bytes) of this object") },
7964 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965};
7966
7967static void
7968encoding_map_dealloc(PyObject* o)
7969{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007970 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007971}
7972
7973static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 "EncodingMap", /*tp_name*/
7976 sizeof(struct encoding_map), /*tp_basicsize*/
7977 0, /*tp_itemsize*/
7978 /* methods */
7979 encoding_map_dealloc, /*tp_dealloc*/
7980 0, /*tp_print*/
7981 0, /*tp_getattr*/
7982 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007983 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 0, /*tp_repr*/
7985 0, /*tp_as_number*/
7986 0, /*tp_as_sequence*/
7987 0, /*tp_as_mapping*/
7988 0, /*tp_hash*/
7989 0, /*tp_call*/
7990 0, /*tp_str*/
7991 0, /*tp_getattro*/
7992 0, /*tp_setattro*/
7993 0, /*tp_as_buffer*/
7994 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7995 0, /*tp_doc*/
7996 0, /*tp_traverse*/
7997 0, /*tp_clear*/
7998 0, /*tp_richcompare*/
7999 0, /*tp_weaklistoffset*/
8000 0, /*tp_iter*/
8001 0, /*tp_iternext*/
8002 encoding_map_methods, /*tp_methods*/
8003 0, /*tp_members*/
8004 0, /*tp_getset*/
8005 0, /*tp_base*/
8006 0, /*tp_dict*/
8007 0, /*tp_descr_get*/
8008 0, /*tp_descr_set*/
8009 0, /*tp_dictoffset*/
8010 0, /*tp_init*/
8011 0, /*tp_alloc*/
8012 0, /*tp_new*/
8013 0, /*tp_free*/
8014 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008015};
8016
8017PyObject*
8018PyUnicode_BuildEncodingMap(PyObject* string)
8019{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008020 PyObject *result;
8021 struct encoding_map *mresult;
8022 int i;
8023 int need_dict = 0;
8024 unsigned char level1[32];
8025 unsigned char level2[512];
8026 unsigned char *mlevel1, *mlevel2, *mlevel3;
8027 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008028 int kind;
8029 void *data;
8030 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 PyErr_BadArgument();
8034 return NULL;
8035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 kind = PyUnicode_KIND(string);
8037 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 memset(level1, 0xFF, sizeof level1);
8039 memset(level2, 0xFF, sizeof level2);
8040
8041 /* If there isn't a one-to-one mapping of NULL to \0,
8042 or if there are non-BMP characters, we need to use
8043 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 need_dict = 1;
8046 for (i = 1; i < 256; i++) {
8047 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048 ch = PyUnicode_READ(kind, data, i);
8049 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050 need_dict = 1;
8051 break;
8052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008053 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054 /* unmapped character */
8055 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008056 l1 = ch >> 11;
8057 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058 if (level1[l1] == 0xFF)
8059 level1[l1] = count2++;
8060 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 }
8063
8064 if (count2 >= 0xFF || count3 >= 0xFF)
8065 need_dict = 1;
8066
8067 if (need_dict) {
8068 PyObject *result = PyDict_New();
8069 PyObject *key, *value;
8070 if (!result)
8071 return NULL;
8072 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008073 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008074 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 if (!key || !value)
8076 goto failed1;
8077 if (PyDict_SetItem(result, key, value) == -1)
8078 goto failed1;
8079 Py_DECREF(key);
8080 Py_DECREF(value);
8081 }
8082 return result;
8083 failed1:
8084 Py_XDECREF(key);
8085 Py_XDECREF(value);
8086 Py_DECREF(result);
8087 return NULL;
8088 }
8089
8090 /* Create a three-level trie */
8091 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8092 16*count2 + 128*count3 - 1);
8093 if (!result)
8094 return PyErr_NoMemory();
8095 PyObject_Init(result, &EncodingMapType);
8096 mresult = (struct encoding_map*)result;
8097 mresult->count2 = count2;
8098 mresult->count3 = count3;
8099 mlevel1 = mresult->level1;
8100 mlevel2 = mresult->level23;
8101 mlevel3 = mresult->level23 + 16*count2;
8102 memcpy(mlevel1, level1, 32);
8103 memset(mlevel2, 0xFF, 16*count2);
8104 memset(mlevel3, 0, 128*count3);
8105 count3 = 0;
8106 for (i = 1; i < 256; i++) {
8107 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 /* unmapped character */
8110 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 o1 = PyUnicode_READ(kind, data, i)>>11;
8112 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 i2 = 16*mlevel1[o1] + o2;
8114 if (mlevel2[i2] == 0xFF)
8115 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 i3 = 128*mlevel2[i2] + o3;
8118 mlevel3[i3] = i;
8119 }
8120 return result;
8121}
8122
8123static int
Victor Stinner22168992011-11-20 17:09:18 +01008124encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125{
8126 struct encoding_map *map = (struct encoding_map*)mapping;
8127 int l1 = c>>11;
8128 int l2 = (c>>7) & 0xF;
8129 int l3 = c & 0x7F;
8130 int i;
8131
Victor Stinner22168992011-11-20 17:09:18 +01008132 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 if (c == 0)
8135 return 0;
8136 /* level 1*/
8137 i = map->level1[l1];
8138 if (i == 0xFF) {
8139 return -1;
8140 }
8141 /* level 2*/
8142 i = map->level23[16*i+l2];
8143 if (i == 0xFF) {
8144 return -1;
8145 }
8146 /* level 3 */
8147 i = map->level23[16*map->count2 + 128*i + l3];
8148 if (i == 0) {
8149 return -1;
8150 }
8151 return i;
8152}
8153
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154/* Lookup the character ch in the mapping. If the character
8155 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008156 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008157static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008158charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159{
Christian Heimes217cfd12007-12-02 14:31:20 +00008160 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008161 PyObject *x;
8162
8163 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 x = PyObject_GetItem(mapping, w);
8166 Py_DECREF(w);
8167 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8169 /* No mapping found means: mapping is undefined. */
8170 PyErr_Clear();
8171 x = Py_None;
8172 Py_INCREF(x);
8173 return x;
8174 } else
8175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008177 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008179 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 long value = PyLong_AS_LONG(x);
8181 if (value < 0 || value > 255) {
8182 PyErr_SetString(PyExc_TypeError,
8183 "character mapping must be in range(256)");
8184 Py_DECREF(x);
8185 return NULL;
8186 }
8187 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008189 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 /* wrong return value */
8193 PyErr_Format(PyExc_TypeError,
8194 "character mapping must return integer, bytes or None, not %.400s",
8195 x->ob_type->tp_name);
8196 Py_DECREF(x);
8197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 }
8199}
8200
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008202charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8205 /* exponentially overallocate to minimize reallocations */
8206 if (requiredsize < 2*outsize)
8207 requiredsize = 2*outsize;
8208 if (_PyBytes_Resize(outobj, requiredsize))
8209 return -1;
8210 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211}
8212
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008217 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218 space is available. Return a new reference to the object that
8219 was put in the output buffer, or Py_None, if the mapping was undefined
8220 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008221 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008222static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008223charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008224 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 PyObject *rep;
8227 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008228 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229
Christian Heimes90aa7642007-12-19 02:45:37 +00008230 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233 if (res == -1)
8234 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 if (outsize<requiredsize)
8236 if (charmapencode_resize(outobj, outpos, requiredsize))
8237 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008238 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 outstart[(*outpos)++] = (char)res;
8240 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008241 }
8242
8243 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 Py_DECREF(rep);
8248 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (PyLong_Check(rep)) {
8251 Py_ssize_t requiredsize = *outpos+1;
8252 if (outsize<requiredsize)
8253 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8254 Py_DECREF(rep);
8255 return enc_EXCEPTION;
8256 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008257 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 else {
8261 const char *repchars = PyBytes_AS_STRING(rep);
8262 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8263 Py_ssize_t requiredsize = *outpos+repsize;
8264 if (outsize<requiredsize)
8265 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8266 Py_DECREF(rep);
8267 return enc_EXCEPTION;
8268 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008269 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 memcpy(outstart + *outpos, repchars, repsize);
8271 *outpos += repsize;
8272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274 Py_DECREF(rep);
8275 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276}
8277
8278/* handle an error in PyUnicode_EncodeCharmap
8279 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008280static int
8281charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008284 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008285 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286{
8287 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008289 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008290 enum PyUnicode_Kind kind;
8291 void *data;
8292 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008294 Py_ssize_t collstartpos = *inpos;
8295 Py_ssize_t collendpos = *inpos+1;
8296 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 char *encoding = "charmap";
8298 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008300 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008301 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302
Benjamin Petersonbac79492012-01-14 13:34:47 -05008303 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008304 return -1;
8305 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 /* find all unencodable characters */
8307 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008309 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008311 val = encoding_map_lookup(ch, mapping);
8312 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 break;
8314 ++collendpos;
8315 continue;
8316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008318 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8319 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 if (rep==NULL)
8321 return -1;
8322 else if (rep!=Py_None) {
8323 Py_DECREF(rep);
8324 break;
8325 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008326 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 }
8329 /* cache callback name lookup
8330 * (if not done yet, i.e. it's the first error) */
8331 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 if ((errors==NULL) || (!strcmp(errors, "strict")))
8333 *known_errorHandler = 1;
8334 else if (!strcmp(errors, "replace"))
8335 *known_errorHandler = 2;
8336 else if (!strcmp(errors, "ignore"))
8337 *known_errorHandler = 3;
8338 else if (!strcmp(errors, "xmlcharrefreplace"))
8339 *known_errorHandler = 4;
8340 else
8341 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 }
8343 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008344 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008345 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 return -1;
8347 case 2: /* replace */
8348 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 x = charmapencode_output('?', mapping, res, respos);
8350 if (x==enc_EXCEPTION) {
8351 return -1;
8352 }
8353 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008354 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 return -1;
8356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008357 }
8358 /* fall through */
8359 case 3: /* ignore */
8360 *inpos = collendpos;
8361 break;
8362 case 4: /* xmlcharrefreplace */
8363 /* generate replacement (temporarily (mis)uses p) */
8364 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 char buffer[2+29+1+1];
8366 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 for (cp = buffer; *cp; ++cp) {
8369 x = charmapencode_output(*cp, mapping, res, respos);
8370 if (x==enc_EXCEPTION)
8371 return -1;
8372 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008373 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 return -1;
8375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 }
8377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008378 *inpos = collendpos;
8379 break;
8380 default:
8381 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008382 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008384 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008386 if (PyBytes_Check(repunicode)) {
8387 /* Directly copy bytes result to output. */
8388 Py_ssize_t outsize = PyBytes_Size(*res);
8389 Py_ssize_t requiredsize;
8390 repsize = PyBytes_Size(repunicode);
8391 requiredsize = *respos + repsize;
8392 if (requiredsize > outsize)
8393 /* Make room for all additional bytes. */
8394 if (charmapencode_resize(res, respos, requiredsize)) {
8395 Py_DECREF(repunicode);
8396 return -1;
8397 }
8398 memcpy(PyBytes_AsString(*res) + *respos,
8399 PyBytes_AsString(repunicode), repsize);
8400 *respos += repsize;
8401 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008402 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008403 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008404 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008406 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008407 Py_DECREF(repunicode);
8408 return -1;
8409 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008410 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008411 data = PyUnicode_DATA(repunicode);
8412 kind = PyUnicode_KIND(repunicode);
8413 for (index = 0; index < repsize; index++) {
8414 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8415 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008417 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return -1;
8419 }
8420 else if (x==enc_FAILED) {
8421 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008422 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return -1;
8424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008425 }
8426 *inpos = newpos;
8427 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 return 0;
8430}
8431
Alexander Belopolsky40018472011-02-26 01:02:56 +00008432PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008433_PyUnicode_EncodeCharmap(PyObject *unicode,
8434 PyObject *mapping,
8435 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 /* output object */
8438 PyObject *res = NULL;
8439 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008440 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008443 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 PyObject *errorHandler = NULL;
8445 PyObject *exc = NULL;
8446 /* the following variable is used for caching string comparisons
8447 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8448 * 3=ignore, 4=xmlcharrefreplace */
8449 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450
Benjamin Petersonbac79492012-01-14 13:34:47 -05008451 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008452 return NULL;
8453 size = PyUnicode_GET_LENGTH(unicode);
8454
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 /* Default to Latin-1 */
8456 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008457 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 /* allocate enough for a simple encoding without
8460 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008461 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 if (res == NULL)
8463 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008464 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008468 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008470 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 if (x==enc_EXCEPTION) /* error */
8472 goto onError;
8473 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008474 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 &exc,
8476 &known_errorHandler, &errorHandler, errors,
8477 &res, &respos)) {
8478 goto onError;
8479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 else
8482 /* done with this character => adjust input position */
8483 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008487 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008488 if (_PyBytes_Resize(&res, respos) < 0)
8489 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491 Py_XDECREF(exc);
8492 Py_XDECREF(errorHandler);
8493 return res;
8494
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496 Py_XDECREF(res);
8497 Py_XDECREF(exc);
8498 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 return NULL;
8500}
8501
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502/* Deprecated */
8503PyObject *
8504PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8505 Py_ssize_t size,
8506 PyObject *mapping,
8507 const char *errors)
8508{
8509 PyObject *result;
8510 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8511 if (unicode == NULL)
8512 return NULL;
8513 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8514 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008515 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516}
8517
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518PyObject *
8519PyUnicode_AsCharmapString(PyObject *unicode,
8520 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521{
8522 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 PyErr_BadArgument();
8524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008526 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527}
8528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008530static void
8531make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533 Py_ssize_t startpos, Py_ssize_t endpos,
8534 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 *exceptionObject = _PyUnicodeTranslateError_Create(
8538 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 }
8540 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8542 goto onError;
8543 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8544 goto onError;
8545 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8546 goto onError;
8547 return;
8548 onError:
8549 Py_DECREF(*exceptionObject);
8550 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 }
8552}
8553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008555static void
8556raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008558 Py_ssize_t startpos, Py_ssize_t endpos,
8559 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560{
8561 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565}
8566
8567/* error handling callback helper:
8568 build arguments, call the callback and check the arguments,
8569 put the result into newpos and return the replacement string, which
8570 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571static PyObject *
8572unicode_translate_call_errorhandler(const char *errors,
8573 PyObject **errorHandler,
8574 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576 Py_ssize_t startpos, Py_ssize_t endpos,
8577 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008579 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008581 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 PyObject *restuple;
8583 PyObject *resunicode;
8584
8585 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 }
8590
8591 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595
8596 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008601 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 Py_DECREF(restuple);
8603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604 }
8605 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 &resunicode, &i_newpos)) {
8607 Py_DECREF(restuple);
8608 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008610 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008612 else
8613 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8616 Py_DECREF(restuple);
8617 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619 Py_INCREF(resunicode);
8620 Py_DECREF(restuple);
8621 return resunicode;
8622}
8623
8624/* Lookup the character ch in the mapping and put the result in result,
8625 which must be decrefed by the caller.
8626 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629{
Christian Heimes217cfd12007-12-02 14:31:20 +00008630 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 PyObject *x;
8632
8633 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 x = PyObject_GetItem(mapping, w);
8636 Py_DECREF(w);
8637 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8639 /* No mapping found means: use 1:1 mapping. */
8640 PyErr_Clear();
8641 *result = NULL;
8642 return 0;
8643 } else
8644 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 }
8646 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 *result = x;
8648 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008650 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 long value = PyLong_AS_LONG(x);
8652 long max = PyUnicode_GetMax();
8653 if (value < 0 || value > max) {
8654 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008655 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 Py_DECREF(x);
8657 return -1;
8658 }
8659 *result = x;
8660 return 0;
8661 }
8662 else if (PyUnicode_Check(x)) {
8663 *result = x;
8664 return 0;
8665 }
8666 else {
8667 /* wrong return value */
8668 PyErr_SetString(PyExc_TypeError,
8669 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008670 Py_DECREF(x);
8671 return -1;
8672 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673}
8674/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 if not reallocate and adjust various state variables.
8676 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008677static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008682 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 /* exponentially overallocate to minimize reallocations */
8684 if (requiredsize < 2 * oldsize)
8685 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8687 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 }
8691 return 0;
8692}
8693/* lookup the character, put the result in the output string and adjust
8694 various state variables. Return a new reference to the object that
8695 was put in the output buffer in *result, or Py_None, if the mapping was
8696 undefined (in which case no character was written).
8697 The called must decref result.
8698 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008699static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8701 PyObject *mapping, Py_UCS4 **output,
8702 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008703 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8706 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 }
8712 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008714 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 }
8718 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 Py_ssize_t repsize;
8720 if (PyUnicode_READY(*res) == -1)
8721 return -1;
8722 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 if (repsize==1) {
8724 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 }
8727 else if (repsize!=0) {
8728 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 Py_ssize_t requiredsize = *opos +
8730 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 Py_ssize_t i;
8733 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 for(i = 0; i < repsize; i++)
8736 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 }
8739 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 return 0;
8742}
8743
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745_PyUnicode_TranslateCharmap(PyObject *input,
8746 PyObject *mapping,
8747 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 /* input object */
8750 char *idata;
8751 Py_ssize_t size, i;
8752 int kind;
8753 /* output buffer */
8754 Py_UCS4 *output = NULL;
8755 Py_ssize_t osize;
8756 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 char *reason = "character maps to <undefined>";
8760 PyObject *errorHandler = NULL;
8761 PyObject *exc = NULL;
8762 /* the following variable is used for caching string comparisons
8763 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8764 * 3=ignore, 4=xmlcharrefreplace */
8765 int known_errorHandler = -1;
8766
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 PyErr_BadArgument();
8769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 if (PyUnicode_READY(input) == -1)
8773 return NULL;
8774 idata = (char*)PyUnicode_DATA(input);
8775 kind = PyUnicode_KIND(input);
8776 size = PyUnicode_GET_LENGTH(input);
8777 i = 0;
8778
8779 if (size == 0) {
8780 Py_INCREF(input);
8781 return input;
8782 }
8783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008784 /* allocate enough for a simple 1:1 translation without
8785 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 osize = size;
8787 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8788 opos = 0;
8789 if (output == NULL) {
8790 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 /* try to encode it */
8796 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 if (charmaptranslate_output(input, i, mapping,
8798 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 Py_XDECREF(x);
8800 goto onError;
8801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008802 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 else { /* untranslatable character */
8806 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8807 Py_ssize_t repsize;
8808 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 Py_ssize_t collstart = i;
8812 Py_ssize_t collend = i+1;
8813 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 while (collend < size) {
8817 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 goto onError;
8819 Py_XDECREF(x);
8820 if (x!=Py_None)
8821 break;
8822 ++collend;
8823 }
8824 /* cache callback name lookup
8825 * (if not done yet, i.e. it's the first error) */
8826 if (known_errorHandler==-1) {
8827 if ((errors==NULL) || (!strcmp(errors, "strict")))
8828 known_errorHandler = 1;
8829 else if (!strcmp(errors, "replace"))
8830 known_errorHandler = 2;
8831 else if (!strcmp(errors, "ignore"))
8832 known_errorHandler = 3;
8833 else if (!strcmp(errors, "xmlcharrefreplace"))
8834 known_errorHandler = 4;
8835 else
8836 known_errorHandler = 0;
8837 }
8838 switch (known_errorHandler) {
8839 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 raise_translate_exception(&exc, input, collstart,
8841 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008842 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 case 2: /* replace */
8844 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 for (coll = collstart; coll<collend; coll++)
8846 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 /* fall through */
8848 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 break;
8851 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 /* generate replacement (temporarily (mis)uses i) */
8853 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 char buffer[2+29+1+1];
8855 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8857 if (charmaptranslate_makespace(&output, &osize,
8858 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 goto onError;
8860 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 break;
8865 default:
8866 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 reason, input, &exc,
8868 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008869 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008871 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008872 Py_DECREF(repunicode);
8873 goto onError;
8874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 repsize = PyUnicode_GET_LENGTH(repunicode);
8877 if (charmaptranslate_makespace(&output, &osize,
8878 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 Py_DECREF(repunicode);
8880 goto onError;
8881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 for (uni2 = 0; repsize-->0; ++uni2)
8883 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8884 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008886 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008887 }
8888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8890 if (!res)
8891 goto onError;
8892 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 Py_XDECREF(exc);
8894 Py_XDECREF(errorHandler);
8895 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008899 Py_XDECREF(exc);
8900 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 return NULL;
8902}
8903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904/* Deprecated. Use PyUnicode_Translate instead. */
8905PyObject *
8906PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8907 Py_ssize_t size,
8908 PyObject *mapping,
8909 const char *errors)
8910{
8911 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8912 if (!unicode)
8913 return NULL;
8914 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8915}
8916
Alexander Belopolsky40018472011-02-26 01:02:56 +00008917PyObject *
8918PyUnicode_Translate(PyObject *str,
8919 PyObject *mapping,
8920 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921{
8922 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 str = PyUnicode_FromObject(str);
8925 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 Py_DECREF(str);
8929 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008930
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 Py_XDECREF(str);
8933 return NULL;
8934}
Tim Petersced69f82003-09-16 20:30:58 +00008935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008937fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938{
8939 /* No need to call PyUnicode_READY(self) because this function is only
8940 called as a callback from fixup() which does it already. */
8941 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8942 const int kind = PyUnicode_KIND(self);
8943 void *data = PyUnicode_DATA(self);
8944 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008945 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 Py_ssize_t i;
8947
8948 for (i = 0; i < len; ++i) {
8949 ch = PyUnicode_READ(kind, data, i);
8950 fixed = 0;
8951 if (ch > 127) {
8952 if (Py_UNICODE_ISSPACE(ch))
8953 fixed = ' ';
8954 else {
8955 const int decimal = Py_UNICODE_TODECIMAL(ch);
8956 if (decimal >= 0)
8957 fixed = '0' + decimal;
8958 }
8959 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008960 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 if (fixed > maxchar)
8962 maxchar = fixed;
8963 PyUnicode_WRITE(kind, data, i, fixed);
8964 }
8965 else if (ch > maxchar)
8966 maxchar = ch;
8967 }
8968 else if (ch > maxchar)
8969 maxchar = ch;
8970 }
8971
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008972 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973}
8974
8975PyObject *
8976_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8977{
8978 if (!PyUnicode_Check(unicode)) {
8979 PyErr_BadInternalCall();
8980 return NULL;
8981 }
8982 if (PyUnicode_READY(unicode) == -1)
8983 return NULL;
8984 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8985 /* If the string is already ASCII, just return the same string */
8986 Py_INCREF(unicode);
8987 return unicode;
8988 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008989 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990}
8991
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008992PyObject *
8993PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8994 Py_ssize_t length)
8995{
Victor Stinnerf0124502011-11-21 23:12:56 +01008996 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008997 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008998 Py_UCS4 maxchar;
8999 enum PyUnicode_Kind kind;
9000 void *data;
9001
Victor Stinner99d7ad02012-02-22 13:37:39 +01009002 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009003 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009004 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009005 if (ch > 127) {
9006 int decimal = Py_UNICODE_TODECIMAL(ch);
9007 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009008 ch = '0' + decimal;
Victor Stinner99d7ad02012-02-22 13:37:39 +01009009 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009010 }
9011 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009012
9013 /* Copy to a new string */
9014 decimal = PyUnicode_New(length, maxchar);
9015 if (decimal == NULL)
9016 return decimal;
9017 kind = PyUnicode_KIND(decimal);
9018 data = PyUnicode_DATA(decimal);
9019 /* Iterate over code points */
9020 for (i = 0; i < length; i++) {
9021 Py_UNICODE ch = s[i];
9022 if (ch > 127) {
9023 int decimal = Py_UNICODE_TODECIMAL(ch);
9024 if (decimal >= 0)
9025 ch = '0' + decimal;
9026 }
9027 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009029 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009030}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009031/* --- Decimal Encoder ---------------------------------------------------- */
9032
Alexander Belopolsky40018472011-02-26 01:02:56 +00009033int
9034PyUnicode_EncodeDecimal(Py_UNICODE *s,
9035 Py_ssize_t length,
9036 char *output,
9037 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009038{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009039 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009040 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009041 enum PyUnicode_Kind kind;
9042 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009043
9044 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 PyErr_BadArgument();
9046 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009047 }
9048
Victor Stinner42bf7752011-11-21 22:52:58 +01009049 unicode = PyUnicode_FromUnicode(s, length);
9050 if (unicode == NULL)
9051 return -1;
9052
Benjamin Petersonbac79492012-01-14 13:34:47 -05009053 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009054 Py_DECREF(unicode);
9055 return -1;
9056 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009057 kind = PyUnicode_KIND(unicode);
9058 data = PyUnicode_DATA(unicode);
9059
Victor Stinnerb84d7232011-11-22 01:50:07 +01009060 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009061 PyObject *exc;
9062 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009064 Py_ssize_t startpos;
9065
9066 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009067
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009069 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009070 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009072 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009073 decimal = Py_UNICODE_TODECIMAL(ch);
9074 if (decimal >= 0) {
9075 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009076 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 continue;
9078 }
9079 if (0 < ch && ch < 256) {
9080 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009081 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 continue;
9083 }
Victor Stinner6345be92011-11-25 20:09:01 +01009084
Victor Stinner42bf7752011-11-21 22:52:58 +01009085 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009086 exc = NULL;
9087 raise_encode_exception(&exc, "decimal", unicode,
9088 startpos, startpos+1,
9089 "invalid decimal Unicode string");
9090 Py_XDECREF(exc);
9091 Py_DECREF(unicode);
9092 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009093 }
9094 /* 0-terminate the output string */
9095 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009096 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009097 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009098}
9099
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100/* --- Helpers ------------------------------------------------------------ */
9101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009103any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 Py_ssize_t start,
9105 Py_ssize_t end)
9106{
9107 int kind1, kind2, kind;
9108 void *buf1, *buf2;
9109 Py_ssize_t len1, len2, result;
9110
9111 kind1 = PyUnicode_KIND(s1);
9112 kind2 = PyUnicode_KIND(s2);
9113 kind = kind1 > kind2 ? kind1 : kind2;
9114 buf1 = PyUnicode_DATA(s1);
9115 buf2 = PyUnicode_DATA(s2);
9116 if (kind1 != kind)
9117 buf1 = _PyUnicode_AsKind(s1, kind);
9118 if (!buf1)
9119 return -2;
9120 if (kind2 != kind)
9121 buf2 = _PyUnicode_AsKind(s2, kind);
9122 if (!buf2) {
9123 if (kind1 != kind) PyMem_Free(buf1);
9124 return -2;
9125 }
9126 len1 = PyUnicode_GET_LENGTH(s1);
9127 len2 = PyUnicode_GET_LENGTH(s2);
9128
Victor Stinner794d5672011-10-10 03:21:36 +02009129 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009130 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009131 case PyUnicode_1BYTE_KIND:
9132 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9133 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9134 else
9135 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9136 break;
9137 case PyUnicode_2BYTE_KIND:
9138 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9139 break;
9140 case PyUnicode_4BYTE_KIND:
9141 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9142 break;
9143 default:
9144 assert(0); result = -2;
9145 }
9146 }
9147 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009148 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009149 case PyUnicode_1BYTE_KIND:
9150 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9151 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9152 else
9153 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9154 break;
9155 case PyUnicode_2BYTE_KIND:
9156 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9157 break;
9158 case PyUnicode_4BYTE_KIND:
9159 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9160 break;
9161 default:
9162 assert(0); result = -2;
9163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 }
9165
9166 if (kind1 != kind)
9167 PyMem_Free(buf1);
9168 if (kind2 != kind)
9169 PyMem_Free(buf2);
9170
9171 return result;
9172}
9173
9174Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009175_PyUnicode_InsertThousandsGrouping(
9176 PyObject *unicode, Py_ssize_t index,
9177 Py_ssize_t n_buffer,
9178 void *digits, Py_ssize_t n_digits,
9179 Py_ssize_t min_width,
9180 const char *grouping, PyObject *thousands_sep,
9181 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182{
Victor Stinner41a863c2012-02-24 00:37:51 +01009183 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009184 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009185 Py_ssize_t thousands_sep_len;
9186 Py_ssize_t len;
9187
9188 if (unicode != NULL) {
9189 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009190 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009191 }
9192 else {
9193 kind = PyUnicode_1BYTE_KIND;
9194 data = NULL;
9195 }
9196 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9197 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9198 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9199 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009200 if (thousands_sep_kind < kind) {
9201 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9202 if (!thousands_sep_data)
9203 return -1;
9204 }
9205 else {
9206 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9207 if (!data)
9208 return -1;
9209 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009210 }
9211
Benjamin Petersonead6b532011-12-20 17:23:42 -06009212 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009214 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009215 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009216 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009217 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009218 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009219 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009220 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009221 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009222 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009223 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009224 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009226 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009227 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009228 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009229 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009230 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009232 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009233 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009234 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009235 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009236 break;
9237 default:
9238 assert(0);
9239 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009241 if (unicode != NULL && thousands_sep_kind != kind) {
9242 if (thousands_sep_kind < kind)
9243 PyMem_Free(thousands_sep_data);
9244 else
9245 PyMem_Free(data);
9246 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009247 if (unicode == NULL) {
9248 *maxchar = 127;
9249 if (len != n_digits) {
9250 *maxchar = Py_MAX(*maxchar,
9251 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9252 }
9253 }
9254 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255}
9256
9257
Thomas Wouters477c8d52006-05-27 19:21:47 +00009258/* Helper macro to fix up start/end slice values in place.

   - end:   values greater than len are clamped to len; negative values are
            offset by len and then floored at 0.
   - start: negative values are offset by len and then floored at 0.
            start is NOT clamped against len or against end, so callers
            must still cope with start > end after the adjustment.

   start and end must be assignable lvalues.  All three arguments are
   expanded several times, and the expansion is a bare sequence of `if`
   statements (no do/while wrapper), so use it only as a full statement. */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009259#define ADJUST_INDICES(start, end, len) \
9260 if (end > len) \
9261 end = len; \
9262 else if (end < 0) { \
9263 end += len; \
9264 if (end < 0) \
9265 end = 0; \
9266 } \
9267 if (start < 0) { \
9268 start += len; \
9269 if (start < 0) \
9270 start = 0; \
9271 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009272
Alexander Belopolsky40018472011-02-26 01:02:56 +00009273Py_ssize_t
9274PyUnicode_Count(PyObject *str,
9275 PyObject *substr,
9276 Py_ssize_t start,
9277 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009279 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009280 PyObject* str_obj;
9281 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 int kind1, kind2, kind;
9283 void *buf1 = NULL, *buf2 = NULL;
9284 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009285
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009286 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009287 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009289 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009290 if (!sub_obj) {
9291 Py_DECREF(str_obj);
9292 return -1;
9293 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009294 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009295 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009296 Py_DECREF(str_obj);
9297 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 }
Tim Petersced69f82003-09-16 20:30:58 +00009299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 kind1 = PyUnicode_KIND(str_obj);
9301 kind2 = PyUnicode_KIND(sub_obj);
9302 kind = kind1 > kind2 ? kind1 : kind2;
9303 buf1 = PyUnicode_DATA(str_obj);
9304 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009305 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (!buf1)
9307 goto onError;
9308 buf2 = PyUnicode_DATA(sub_obj);
9309 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009310 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 if (!buf2)
9312 goto onError;
9313 len1 = PyUnicode_GET_LENGTH(str_obj);
9314 len2 = PyUnicode_GET_LENGTH(sub_obj);
9315
9316 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009317 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009319 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9320 result = asciilib_count(
9321 ((Py_UCS1*)buf1) + start, end - start,
9322 buf2, len2, PY_SSIZE_T_MAX
9323 );
9324 else
9325 result = ucs1lib_count(
9326 ((Py_UCS1*)buf1) + start, end - start,
9327 buf2, len2, PY_SSIZE_T_MAX
9328 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 break;
9330 case PyUnicode_2BYTE_KIND:
9331 result = ucs2lib_count(
9332 ((Py_UCS2*)buf1) + start, end - start,
9333 buf2, len2, PY_SSIZE_T_MAX
9334 );
9335 break;
9336 case PyUnicode_4BYTE_KIND:
9337 result = ucs4lib_count(
9338 ((Py_UCS4*)buf1) + start, end - start,
9339 buf2, len2, PY_SSIZE_T_MAX
9340 );
9341 break;
9342 default:
9343 assert(0); result = 0;
9344 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009345
9346 Py_DECREF(sub_obj);
9347 Py_DECREF(str_obj);
9348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 if (kind1 != kind)
9350 PyMem_Free(buf1);
9351 if (kind2 != kind)
9352 PyMem_Free(buf2);
9353
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 onError:
9356 Py_DECREF(sub_obj);
9357 Py_DECREF(str_obj);
9358 if (kind1 != kind && buf1)
9359 PyMem_Free(buf1);
9360 if (kind2 != kind && buf2)
9361 PyMem_Free(buf2);
9362 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363}
9364
Alexander Belopolsky40018472011-02-26 01:02:56 +00009365Py_ssize_t
9366PyUnicode_Find(PyObject *str,
9367 PyObject *sub,
9368 Py_ssize_t start,
9369 Py_ssize_t end,
9370 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009372 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009375 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009377 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009378 if (!sub) {
9379 Py_DECREF(str);
9380 return -2;
9381 }
9382 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9383 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 Py_DECREF(str);
9385 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 }
Tim Petersced69f82003-09-16 20:30:58 +00009387
Victor Stinner794d5672011-10-10 03:21:36 +02009388 result = any_find_slice(direction,
9389 str, sub, start, end
9390 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009391
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 Py_DECREF(sub);
9394
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 return result;
9396}
9397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398Py_ssize_t
9399PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9400 Py_ssize_t start, Py_ssize_t end,
9401 int direction)
9402{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009404 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 if (PyUnicode_READY(str) == -1)
9406 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009407 if (start < 0 || end < 0) {
9408 PyErr_SetString(PyExc_IndexError, "string index out of range");
9409 return -2;
9410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 if (end > PyUnicode_GET_LENGTH(str))
9412 end = PyUnicode_GET_LENGTH(str);
9413 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009414 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9415 kind, end-start, ch, direction);
9416 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009418 else
9419 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420}
9421
Alexander Belopolsky40018472011-02-26 01:02:56 +00009422static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009423tailmatch(PyObject *self,
9424 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009425 Py_ssize_t start,
9426 Py_ssize_t end,
9427 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 int kind_self;
9430 int kind_sub;
9431 void *data_self;
9432 void *data_sub;
9433 Py_ssize_t offset;
9434 Py_ssize_t i;
9435 Py_ssize_t end_sub;
9436
9437 if (PyUnicode_READY(self) == -1 ||
9438 PyUnicode_READY(substring) == -1)
9439 return 0;
9440
9441 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442 return 1;
9443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9445 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 kind_self = PyUnicode_KIND(self);
9450 data_self = PyUnicode_DATA(self);
9451 kind_sub = PyUnicode_KIND(substring);
9452 data_sub = PyUnicode_DATA(substring);
9453 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9454
9455 if (direction > 0)
9456 offset = end;
9457 else
9458 offset = start;
9459
9460 if (PyUnicode_READ(kind_self, data_self, offset) ==
9461 PyUnicode_READ(kind_sub, data_sub, 0) &&
9462 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9463 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9464 /* If both are of the same kind, memcmp is sufficient */
9465 if (kind_self == kind_sub) {
9466 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009467 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 data_sub,
9469 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009470 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 }
9472 /* otherwise we have to compare each character by first accesing it */
9473 else {
9474 /* We do not need to compare 0 and len(substring)-1 because
9475 the if statement above ensured already that they are equal
9476 when we end up here. */
9477 // TODO: honor direction and do a forward or backwards search
9478 for (i = 1; i < end_sub; ++i) {
9479 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9480 PyUnicode_READ(kind_sub, data_sub, i))
9481 return 0;
9482 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 }
9486
9487 return 0;
9488}
9489
Alexander Belopolsky40018472011-02-26 01:02:56 +00009490Py_ssize_t
9491PyUnicode_Tailmatch(PyObject *str,
9492 PyObject *substr,
9493 Py_ssize_t start,
9494 Py_ssize_t end,
9495 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009497 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009498
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 str = PyUnicode_FromObject(str);
9500 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 substr = PyUnicode_FromObject(substr);
9503 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 Py_DECREF(str);
9505 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 }
Tim Petersced69f82003-09-16 20:30:58 +00009507
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009508 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 Py_DECREF(str);
9511 Py_DECREF(substr);
9512 return result;
9513}
9514
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515/* Apply fixfct filter to the Unicode object self and return a
9516 reference to the modified object */
9517
Alexander Belopolsky40018472011-02-26 01:02:56 +00009518static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009519fixup(PyObject *self,
9520 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 PyObject *u;
9523 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009524 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009526 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009529 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 /* fix functions return the new maximum character in a string,
9532 if the kind of the resulting unicode object does not change,
9533 everything is fine. Otherwise we need to change the string kind
9534 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009535 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009536
9537 if (maxchar_new == 0) {
9538 /* no changes */;
9539 if (PyUnicode_CheckExact(self)) {
9540 Py_DECREF(u);
9541 Py_INCREF(self);
9542 return self;
9543 }
9544 else
9545 return u;
9546 }
9547
9548 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 maxchar_new = 127;
9550 else if (maxchar_new <= 255)
9551 maxchar_new = 255;
9552 else if (maxchar_new <= 65535)
9553 maxchar_new = 65535;
9554 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009555 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556
Victor Stinnereaab6042011-12-11 22:22:39 +01009557 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009559
9560 /* In case the maximum character changed, we need to
9561 convert the string to the new category. */
9562 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9563 if (v == NULL) {
9564 Py_DECREF(u);
9565 return NULL;
9566 }
9567 if (maxchar_new > maxchar_old) {
9568 /* If the maxchar increased so that the kind changed, not all
9569 characters are representable anymore and we need to fix the
9570 string again. This only happens in very few cases. */
9571 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9572 maxchar_old = fixfct(v);
9573 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 }
9575 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009576 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009578 Py_DECREF(u);
9579 assert(_PyUnicode_CheckConsistency(v, 1));
9580 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581}
9582
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009583static PyObject *
9584ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009586 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9587 char *resdata, *data = PyUnicode_DATA(self);
9588 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009589
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009590 res = PyUnicode_New(len, 127);
9591 if (res == NULL)
9592 return NULL;
9593 resdata = PyUnicode_DATA(res);
9594 if (lower)
9595 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009597 _Py_bytes_upper(resdata, data, len);
9598 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599}
9600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009602handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009604 Py_ssize_t j;
9605 int final_sigma;
9606 Py_UCS4 c;
9607 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009608
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009609 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9610
9611 where ! is a negation and \p{xxx} is a character with property xxx.
9612 */
9613 for (j = i - 1; j >= 0; j--) {
9614 c = PyUnicode_READ(kind, data, j);
9615 if (!_PyUnicode_IsCaseIgnorable(c))
9616 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9619 if (final_sigma) {
9620 for (j = i + 1; j < length; j++) {
9621 c = PyUnicode_READ(kind, data, j);
9622 if (!_PyUnicode_IsCaseIgnorable(c))
9623 break;
9624 }
9625 final_sigma = j == length || !_PyUnicode_IsCased(c);
9626 }
9627 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628}
9629
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630static int
9631lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9632 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009634 /* Obscure special case. */
9635 if (c == 0x3A3) {
9636 mapped[0] = handle_capital_sigma(kind, data, length, i);
9637 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640}
9641
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642static Py_ssize_t
9643do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 Py_ssize_t i, k = 0;
9646 int n_res, j;
9647 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009648
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009649 c = PyUnicode_READ(kind, data, 0);
9650 n_res = _PyUnicode_ToUpperFull(c, mapped);
9651 for (j = 0; j < n_res; j++) {
9652 if (mapped[j] > *maxchar)
9653 *maxchar = mapped[j];
9654 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656 for (i = 1; i < length; i++) {
9657 c = PyUnicode_READ(kind, data, i);
9658 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9659 for (j = 0; j < n_res; j++) {
9660 if (mapped[j] > *maxchar)
9661 *maxchar = mapped[j];
9662 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009663 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009664 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666}
9667
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668static Py_ssize_t
9669do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9670 Py_ssize_t i, k = 0;
9671
9672 for (i = 0; i < length; i++) {
9673 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9674 int n_res, j;
9675 if (Py_UNICODE_ISUPPER(c)) {
9676 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9677 }
9678 else if (Py_UNICODE_ISLOWER(c)) {
9679 n_res = _PyUnicode_ToUpperFull(c, mapped);
9680 }
9681 else {
9682 n_res = 1;
9683 mapped[0] = c;
9684 }
9685 for (j = 0; j < n_res; j++) {
9686 if (mapped[j] > *maxchar)
9687 *maxchar = mapped[j];
9688 res[k++] = mapped[j];
9689 }
9690 }
9691 return k;
9692}
9693
9694static Py_ssize_t
9695do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9696 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 Py_ssize_t i, k = 0;
9699
9700 for (i = 0; i < length; i++) {
9701 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9702 int n_res, j;
9703 if (lower)
9704 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9705 else
9706 n_res = _PyUnicode_ToUpperFull(c, mapped);
9707 for (j = 0; j < n_res; j++) {
9708 if (mapped[j] > *maxchar)
9709 *maxchar = mapped[j];
9710 res[k++] = mapped[j];
9711 }
9712 }
9713 return k;
9714}
9715
9716static Py_ssize_t
9717do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9718{
9719 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9720}
9721
9722static Py_ssize_t
9723do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9724{
9725 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9726}
9727
Benjamin Petersone51757f2012-01-12 21:10:29 -05009728static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009729do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9730{
9731 Py_ssize_t i, k = 0;
9732
9733 for (i = 0; i < length; i++) {
9734 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9735 Py_UCS4 mapped[3];
9736 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9737 for (j = 0; j < n_res; j++) {
9738 if (mapped[j] > *maxchar)
9739 *maxchar = mapped[j];
9740 res[k++] = mapped[j];
9741 }
9742 }
9743 return k;
9744}
9745
9746static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009747do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9748{
9749 Py_ssize_t i, k = 0;
9750 int previous_is_cased;
9751
9752 previous_is_cased = 0;
9753 for (i = 0; i < length; i++) {
9754 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9755 Py_UCS4 mapped[3];
9756 int n_res, j;
9757
9758 if (previous_is_cased)
9759 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9760 else
9761 n_res = _PyUnicode_ToTitleFull(c, mapped);
9762
9763 for (j = 0; j < n_res; j++) {
9764 if (mapped[j] > *maxchar)
9765 *maxchar = mapped[j];
9766 res[k++] = mapped[j];
9767 }
9768
9769 previous_is_cased = _PyUnicode_IsCased(c);
9770 }
9771 return k;
9772}
9773
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009774static PyObject *
9775case_operation(PyObject *self,
9776 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9777{
9778 PyObject *res = NULL;
9779 Py_ssize_t length, newlength = 0;
9780 int kind, outkind;
9781 void *data, *outdata;
9782 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9783
Benjamin Petersoneea48462012-01-16 14:28:50 -05009784 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009785
9786 kind = PyUnicode_KIND(self);
9787 data = PyUnicode_DATA(self);
9788 length = PyUnicode_GET_LENGTH(self);
9789 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9790 if (tmp == NULL)
9791 return PyErr_NoMemory();
9792 newlength = perform(kind, data, length, tmp, &maxchar);
9793 res = PyUnicode_New(newlength, maxchar);
9794 if (res == NULL)
9795 goto leave;
9796 tmpend = tmp + newlength;
9797 outdata = PyUnicode_DATA(res);
9798 outkind = PyUnicode_KIND(res);
9799 switch (outkind) {
9800 case PyUnicode_1BYTE_KIND:
9801 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9802 break;
9803 case PyUnicode_2BYTE_KIND:
9804 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9805 break;
9806 case PyUnicode_4BYTE_KIND:
9807 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9808 break;
9809 default:
9810 assert(0);
9811 break;
9812 }
9813 leave:
9814 PyMem_FREE(tmp);
9815 return res;
9816}
9817
Tim Peters8ce9f162004-08-27 01:49:32 +00009818PyObject *
9819PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009822 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009824 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009825 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9826 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009827 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009829 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009831 int use_memcpy;
9832 unsigned char *res_data = NULL, *sep_data = NULL;
9833 PyObject *last_obj;
9834 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835
Tim Peters05eba1f2004-08-27 21:32:02 +00009836 fseq = PySequence_Fast(seq, "");
9837 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009839 }
9840
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009841 /* NOTE: the following code can't call back into Python code,
9842 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009843 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009844
Tim Peters05eba1f2004-08-27 21:32:02 +00009845 seqlen = PySequence_Fast_GET_SIZE(fseq);
9846 /* If empty sequence, return u"". */
9847 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009848 Py_DECREF(fseq);
9849 Py_INCREF(unicode_empty);
9850 res = unicode_empty;
9851 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009852 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009853
Tim Peters05eba1f2004-08-27 21:32:02 +00009854 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009855 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009856 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009857 if (seqlen == 1) {
9858 if (PyUnicode_CheckExact(items[0])) {
9859 res = items[0];
9860 Py_INCREF(res);
9861 Py_DECREF(fseq);
9862 return res;
9863 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009864 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009865 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009866 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009867 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009868 /* Set up sep and seplen */
9869 if (separator == NULL) {
9870 /* fall back to a blank space separator */
9871 sep = PyUnicode_FromOrdinal(' ');
9872 if (!sep)
9873 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009874 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009875 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009876 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009877 else {
9878 if (!PyUnicode_Check(separator)) {
9879 PyErr_Format(PyExc_TypeError,
9880 "separator: expected str instance,"
9881 " %.80s found",
9882 Py_TYPE(separator)->tp_name);
9883 goto onError;
9884 }
9885 if (PyUnicode_READY(separator))
9886 goto onError;
9887 sep = separator;
9888 seplen = PyUnicode_GET_LENGTH(separator);
9889 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9890 /* inc refcount to keep this code path symmetric with the
9891 above case of a blank separator */
9892 Py_INCREF(sep);
9893 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009894 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 }
9896
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009897 /* There are at least two things to join, or else we have a subclass
9898 * of str in the sequence.
9899 * Do a pre-pass to figure out the total amount of space we'll
9900 * need (sz), and see whether all argument are strings.
9901 */
9902 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009903#ifdef Py_DEBUG
9904 use_memcpy = 0;
9905#else
9906 use_memcpy = 1;
9907#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009908 for (i = 0; i < seqlen; i++) {
9909 const Py_ssize_t old_sz = sz;
9910 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009911 if (!PyUnicode_Check(item)) {
9912 PyErr_Format(PyExc_TypeError,
9913 "sequence item %zd: expected str instance,"
9914 " %.80s found",
9915 i, Py_TYPE(item)->tp_name);
9916 goto onError;
9917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 if (PyUnicode_READY(item) == -1)
9919 goto onError;
9920 sz += PyUnicode_GET_LENGTH(item);
9921 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009922 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009923 if (i != 0)
9924 sz += seplen;
9925 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9926 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009928 goto onError;
9929 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009930 if (use_memcpy && last_obj != NULL) {
9931 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9932 use_memcpy = 0;
9933 }
9934 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 }
Tim Petersced69f82003-09-16 20:30:58 +00009936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938 if (res == NULL)
9939 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009940
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009942#ifdef Py_DEBUG
9943 use_memcpy = 0;
9944#else
9945 if (use_memcpy) {
9946 res_data = PyUnicode_1BYTE_DATA(res);
9947 kind = PyUnicode_KIND(res);
9948 if (seplen != 0)
9949 sep_data = PyUnicode_1BYTE_DATA(sep);
9950 }
9951#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009953 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009954 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009955 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009956 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009957 if (use_memcpy) {
9958 Py_MEMCPY(res_data,
9959 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009960 kind * seplen);
9961 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009962 }
9963 else {
9964 copy_characters(res, res_offset, sep, 0, seplen);
9965 res_offset += seplen;
9966 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009967 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009968 itemlen = PyUnicode_GET_LENGTH(item);
9969 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009970 if (use_memcpy) {
9971 Py_MEMCPY(res_data,
9972 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009973 kind * itemlen);
9974 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 }
9976 else {
9977 copy_characters(res, res_offset, item, 0, itemlen);
9978 res_offset += itemlen;
9979 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009980 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009981 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009982 if (use_memcpy)
9983 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009984 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009985 else
9986 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009987
Tim Peters05eba1f2004-08-27 21:32:02 +00009988 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009990 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992
Benjamin Peterson29060642009-01-31 22:14:21 +00009993 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009994 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009996 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997 return NULL;
9998}
9999
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the code-unit width
   (1-, 2- or 4-byte); PyUnicode_WCHAR_KIND is rejected by assertion.

   Every argument is parenthesized where it is used, so expressions may be
   passed safely.  `value` and `length` are still expanded more than once
   (once per loop iteration in the 2- and 4-byte cases), so arguments must
   not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + (start); \
            Py_ssize_t i_; \
            for (i_ = 0; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + (start); \
            Py_ssize_t i_; \
            for (i_ = 0; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10023
Victor Stinner3fe55312012-01-04 00:33:50 +010010024Py_ssize_t
10025PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10026 Py_UCS4 fill_char)
10027{
10028 Py_ssize_t maxlen;
10029 enum PyUnicode_Kind kind;
10030 void *data;
10031
10032 if (!PyUnicode_Check(unicode)) {
10033 PyErr_BadInternalCall();
10034 return -1;
10035 }
10036 if (PyUnicode_READY(unicode) == -1)
10037 return -1;
10038 if (unicode_check_modifiable(unicode))
10039 return -1;
10040
10041 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10042 PyErr_SetString(PyExc_ValueError,
10043 "fill character is bigger than "
10044 "the string maximum character");
10045 return -1;
10046 }
10047
10048 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10049 length = Py_MIN(maxlen, length);
10050 if (length <= 0)
10051 return 0;
10052
10053 kind = PyUnicode_KIND(unicode);
10054 data = PyUnicode_DATA(unicode);
10055 FILL(kind, data, fill_char, start, length);
10056 return length;
10057}
10058
Victor Stinner9310abb2011-10-05 00:59:23 +020010059static PyObject *
10060pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010061 Py_ssize_t left,
10062 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 PyObject *u;
10066 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010067 int kind;
10068 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069
10070 if (left < 0)
10071 left = 0;
10072 if (right < 0)
10073 right = 0;
10074
Victor Stinnerc4b49542011-12-11 22:44:26 +010010075 if (left == 0 && right == 0)
10076 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10079 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010080 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10081 return NULL;
10082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10084 if (fill > maxchar)
10085 maxchar = fill;
10086 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010087 if (!u)
10088 return NULL;
10089
10090 kind = PyUnicode_KIND(u);
10091 data = PyUnicode_DATA(u);
10092 if (left)
10093 FILL(kind, data, fill, 0, left);
10094 if (right)
10095 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010096 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010097 assert(_PyUnicode_CheckConsistency(u, 1));
10098 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099}
10100
Alexander Belopolsky40018472011-02-26 01:02:56 +000010101PyObject *
10102PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105
10106 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010107 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010109 if (PyUnicode_READY(string) == -1) {
10110 Py_DECREF(string);
10111 return NULL;
10112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113
Benjamin Petersonead6b532011-12-20 17:23:42 -060010114 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010116 if (PyUnicode_IS_ASCII(string))
10117 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010118 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 PyUnicode_GET_LENGTH(string), keepends);
10120 else
10121 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010122 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010123 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 break;
10125 case PyUnicode_2BYTE_KIND:
10126 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010127 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 PyUnicode_GET_LENGTH(string), keepends);
10129 break;
10130 case PyUnicode_4BYTE_KIND:
10131 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010132 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 PyUnicode_GET_LENGTH(string), keepends);
10134 break;
10135 default:
10136 assert(0);
10137 list = 0;
10138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139 Py_DECREF(string);
10140 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141}
10142
Alexander Belopolsky40018472011-02-26 01:02:56 +000010143static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010144split(PyObject *self,
10145 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010146 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 int kind1, kind2, kind;
10149 void *buf1, *buf2;
10150 Py_ssize_t len1, len2;
10151 PyObject* out;
10152
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010154 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (PyUnicode_READY(self) == -1)
10157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010160 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010162 if (PyUnicode_IS_ASCII(self))
10163 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010164 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010165 PyUnicode_GET_LENGTH(self), maxcount
10166 );
10167 else
10168 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010170 PyUnicode_GET_LENGTH(self), maxcount
10171 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 case PyUnicode_2BYTE_KIND:
10173 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010174 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 PyUnicode_GET_LENGTH(self), maxcount
10176 );
10177 case PyUnicode_4BYTE_KIND:
10178 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 PyUnicode_GET_LENGTH(self), maxcount
10181 );
10182 default:
10183 assert(0);
10184 return NULL;
10185 }
10186
10187 if (PyUnicode_READY(substring) == -1)
10188 return NULL;
10189
10190 kind1 = PyUnicode_KIND(self);
10191 kind2 = PyUnicode_KIND(substring);
10192 kind = kind1 > kind2 ? kind1 : kind2;
10193 buf1 = PyUnicode_DATA(self);
10194 buf2 = PyUnicode_DATA(substring);
10195 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 if (!buf1)
10198 return NULL;
10199 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010200 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 if (!buf2) {
10202 if (kind1 != kind) PyMem_Free(buf1);
10203 return NULL;
10204 }
10205 len1 = PyUnicode_GET_LENGTH(self);
10206 len2 = PyUnicode_GET_LENGTH(substring);
10207
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10211 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 else
10214 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 break;
10217 case PyUnicode_2BYTE_KIND:
10218 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010219 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 break;
10221 case PyUnicode_4BYTE_KIND:
10222 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 break;
10225 default:
10226 out = NULL;
10227 }
10228 if (kind1 != kind)
10229 PyMem_Free(buf1);
10230 if (kind2 != kind)
10231 PyMem_Free(buf2);
10232 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233}
10234
Alexander Belopolsky40018472011-02-26 01:02:56 +000010235static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010236rsplit(PyObject *self,
10237 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010238 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 int kind1, kind2, kind;
10241 void *buf1, *buf2;
10242 Py_ssize_t len1, len2;
10243 PyObject* out;
10244
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010245 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010246 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (PyUnicode_READY(self) == -1)
10249 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010252 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 if (PyUnicode_IS_ASCII(self))
10255 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010256 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 PyUnicode_GET_LENGTH(self), maxcount
10258 );
10259 else
10260 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010261 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010262 PyUnicode_GET_LENGTH(self), maxcount
10263 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 case PyUnicode_2BYTE_KIND:
10265 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 PyUnicode_GET_LENGTH(self), maxcount
10268 );
10269 case PyUnicode_4BYTE_KIND:
10270 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010271 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 PyUnicode_GET_LENGTH(self), maxcount
10273 );
10274 default:
10275 assert(0);
10276 return NULL;
10277 }
10278
10279 if (PyUnicode_READY(substring) == -1)
10280 return NULL;
10281
10282 kind1 = PyUnicode_KIND(self);
10283 kind2 = PyUnicode_KIND(substring);
10284 kind = kind1 > kind2 ? kind1 : kind2;
10285 buf1 = PyUnicode_DATA(self);
10286 buf2 = PyUnicode_DATA(substring);
10287 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (!buf1)
10290 return NULL;
10291 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010292 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (!buf2) {
10294 if (kind1 != kind) PyMem_Free(buf1);
10295 return NULL;
10296 }
10297 len1 = PyUnicode_GET_LENGTH(self);
10298 len2 = PyUnicode_GET_LENGTH(substring);
10299
Benjamin Petersonead6b532011-12-20 17:23:42 -060010300 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10303 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010305 else
10306 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010307 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 break;
10309 case PyUnicode_2BYTE_KIND:
10310 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010311 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 break;
10313 case PyUnicode_4BYTE_KIND:
10314 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010315 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 break;
10317 default:
10318 out = NULL;
10319 }
10320 if (kind1 != kind)
10321 PyMem_Free(buf1);
10322 if (kind2 != kind)
10323 PyMem_Free(buf2);
10324 return out;
10325}
10326
10327static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10329 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010331 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10334 return asciilib_find(buf1, len1, buf2, len2, offset);
10335 else
10336 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 case PyUnicode_2BYTE_KIND:
10338 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10339 case PyUnicode_4BYTE_KIND:
10340 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10341 }
10342 assert(0);
10343 return -1;
10344}
10345
10346static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10348 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010350 switch (kind) {
10351 case PyUnicode_1BYTE_KIND:
10352 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10353 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10354 else
10355 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10356 case PyUnicode_2BYTE_KIND:
10357 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10358 case PyUnicode_4BYTE_KIND:
10359 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10360 }
10361 assert(0);
10362 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010363}
10364
Alexander Belopolsky40018472011-02-26 01:02:56 +000010365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366replace(PyObject *self, PyObject *str1,
10367 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 PyObject *u;
10370 char *sbuf = PyUnicode_DATA(self);
10371 char *buf1 = PyUnicode_DATA(str1);
10372 char *buf2 = PyUnicode_DATA(str2);
10373 int srelease = 0, release1 = 0, release2 = 0;
10374 int skind = PyUnicode_KIND(self);
10375 int kind1 = PyUnicode_KIND(str1);
10376 int kind2 = PyUnicode_KIND(str2);
10377 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10378 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10379 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 int mayshrink;
10381 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
10383 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010386 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387
Victor Stinner59de0ee2011-10-07 10:01:28 +020010388 if (str1 == str2)
10389 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 if (skind < kind1)
10391 /* substring too wide to be present */
10392 goto nothing;
10393
Victor Stinner49a0a212011-10-12 23:46:10 +020010394 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10395 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10396 /* Replacing str1 with str2 may cause a maxchar reduction in the
10397 result string. */
10398 mayshrink = (maxchar_str2 < maxchar);
10399 maxchar = Py_MAX(maxchar, maxchar_str2);
10400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010402 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010404 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010406 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010407 Py_UCS4 u1, u2;
10408 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010409 Py_ssize_t index, pos;
10410 char *src;
10411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010413 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10414 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010415 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010418 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010420 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010422
10423 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10424 index = 0;
10425 src = sbuf;
10426 while (--maxcount)
10427 {
10428 pos++;
10429 src += pos * PyUnicode_KIND(self);
10430 slen -= pos;
10431 index += pos;
10432 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10433 if (pos < 0)
10434 break;
10435 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10436 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010437 }
10438 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 int rkind = skind;
10440 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010441 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (kind1 < rkind) {
10444 /* widen substring */
10445 buf1 = _PyUnicode_AsKind(str1, rkind);
10446 if (!buf1) goto error;
10447 release1 = 1;
10448 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010449 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 if (i < 0)
10451 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (rkind > kind2) {
10453 /* widen replacement */
10454 buf2 = _PyUnicode_AsKind(str2, rkind);
10455 if (!buf2) goto error;
10456 release2 = 1;
10457 }
10458 else if (rkind < kind2) {
10459 /* widen self and buf1 */
10460 rkind = kind2;
10461 if (release1) PyMem_Free(buf1);
10462 sbuf = _PyUnicode_AsKind(self, rkind);
10463 if (!sbuf) goto error;
10464 srelease = 1;
10465 buf1 = _PyUnicode_AsKind(str1, rkind);
10466 if (!buf1) goto error;
10467 release1 = 1;
10468 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 u = PyUnicode_New(slen, maxchar);
10470 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010472 assert(PyUnicode_KIND(u) == rkind);
10473 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010474
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010476 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010477 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010481
10482 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010483 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010484 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010485 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010486 if (i == -1)
10487 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010488 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010490 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 }
10495 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 Py_ssize_t n, i, j, ires;
10497 Py_ssize_t product, new_size;
10498 int rkind = skind;
10499 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 buf1 = _PyUnicode_AsKind(str1, rkind);
10504 if (!buf1) goto error;
10505 release1 = 1;
10506 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010507 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 if (n == 0)
10509 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010511 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf2 = _PyUnicode_AsKind(str2, rkind);
10513 if (!buf2) goto error;
10514 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 rkind = kind2;
10519 sbuf = _PyUnicode_AsKind(self, rkind);
10520 if (!sbuf) goto error;
10521 srelease = 1;
10522 if (release1) PyMem_Free(buf1);
10523 buf1 = _PyUnicode_AsKind(str1, rkind);
10524 if (!buf1) goto error;
10525 release1 = 1;
10526 }
10527 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10528 PyUnicode_GET_LENGTH(str1))); */
10529 product = n * (len2-len1);
10530 if ((product / (len2-len1)) != n) {
10531 PyErr_SetString(PyExc_OverflowError,
10532 "replace string is too long");
10533 goto error;
10534 }
10535 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010536 if (new_size == 0) {
10537 Py_INCREF(unicode_empty);
10538 u = unicode_empty;
10539 goto done;
10540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10542 PyErr_SetString(PyExc_OverflowError,
10543 "replace string is too long");
10544 goto error;
10545 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 u = PyUnicode_New(new_size, maxchar);
10547 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 assert(PyUnicode_KIND(u) == rkind);
10550 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 ires = i = 0;
10552 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 while (n-- > 0) {
10554 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010556 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010558 if (j == -1)
10559 break;
10560 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010561 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010562 memcpy(res + rkind * ires,
10563 sbuf + rkind * i,
10564 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010566 }
10567 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010569 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010571 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010577 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010578 memcpy(res + rkind * ires,
10579 sbuf + rkind * i,
10580 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 }
10582 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010583 /* interleave */
10584 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010587 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010589 if (--n <= 0)
10590 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 memcpy(res + rkind * ires,
10592 sbuf + rkind * i,
10593 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 ires++;
10595 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010596 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 memcpy(res + rkind * ires,
10598 sbuf + rkind * i,
10599 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010601 }
10602
10603 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010604 unicode_adjust_maxchar(&u);
10605 if (u == NULL)
10606 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010608
10609 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (srelease)
10611 PyMem_FREE(sbuf);
10612 if (release1)
10613 PyMem_FREE(buf1);
10614 if (release2)
10615 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010616 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (srelease)
10622 PyMem_FREE(sbuf);
10623 if (release1)
10624 PyMem_FREE(buf1);
10625 if (release2)
10626 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010627 return unicode_result_unchanged(self);
10628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 error:
10630 if (srelease && sbuf)
10631 PyMem_FREE(sbuf);
10632 if (release1 && buf1)
10633 PyMem_FREE(buf1);
10634 if (release2 && buf2)
10635 PyMem_FREE(buf2);
10636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637}
10638
10639/* --- Unicode Object Methods --------------------------------------------- */
10640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010641PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643\n\
10644Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010645characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
10647static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010648unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010650 if (PyUnicode_READY(self) == -1)
10651 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010652 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653}
10654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010655PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657\n\
10658Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010659have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660
10661static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010662unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010664 if (PyUnicode_READY(self) == -1)
10665 return NULL;
10666 if (PyUnicode_GET_LENGTH(self) == 0)
10667 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010668 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669}
10670
Benjamin Petersond5890c82012-01-14 13:23:30 -050010671PyDoc_STRVAR(casefold__doc__,
10672 "S.casefold() -> str\n\
10673\n\
10674Return a version of S suitable for caseless comparisons.");
10675
10676static PyObject *
10677unicode_casefold(PyObject *self)
10678{
10679 if (PyUnicode_READY(self) == -1)
10680 return NULL;
10681 if (PyUnicode_IS_ASCII(self))
10682 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010683 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010684}
10685
10686
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010687/* Argument converter. Coerces to a single unicode character */
10688
10689static int
10690convert_uc(PyObject *obj, void *addr)
10691{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010693 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010694
Benjamin Peterson14339b62009-01-31 16:36:08 +000010695 uniobj = PyUnicode_FromObject(obj);
10696 if (uniobj == NULL) {
10697 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010698 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010699 return 0;
10700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010702 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010704 Py_DECREF(uniobj);
10705 return 0;
10706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010708 Py_DECREF(uniobj);
10709 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010710}
10711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010712PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010715Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010716done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010719unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010721 Py_ssize_t marg, left;
10722 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 Py_UCS4 fillchar = ' ';
10724
Victor Stinnere9a29352011-10-01 02:14:59 +020010725 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
Benjamin Petersonbac79492012-01-14 13:34:47 -050010728 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729 return NULL;
10730
Victor Stinnerc4b49542011-12-11 22:44:26 +010010731 if (PyUnicode_GET_LENGTH(self) >= width)
10732 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
Victor Stinnerc4b49542011-12-11 22:44:26 +010010734 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735 left = marg / 2 + (marg & width & 1);
10736
Victor Stinner9310abb2011-10-05 00:59:23 +020010737 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738}
10739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740/* This function assumes that str1 and str2 are readied by the caller. */
10741
Marc-André Lemburge5034372000-08-08 08:04:29 +000010742static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010743unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 int kind1, kind2;
10746 void *data1, *data2;
10747 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 kind1 = PyUnicode_KIND(str1);
10750 kind2 = PyUnicode_KIND(str2);
10751 data1 = PyUnicode_DATA(str1);
10752 data2 = PyUnicode_DATA(str2);
10753 len1 = PyUnicode_GET_LENGTH(str1);
10754 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 for (i = 0; i < len1 && i < len2; ++i) {
10757 Py_UCS4 c1, c2;
10758 c1 = PyUnicode_READ(kind1, data1, i);
10759 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010760
10761 if (c1 != c2)
10762 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010763 }
10764
10765 return (len1 < len2) ? -1 : (len1 != len2);
10766}
10767
Alexander Belopolsky40018472011-02-26 01:02:56 +000010768int
10769PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10772 if (PyUnicode_READY(left) == -1 ||
10773 PyUnicode_READY(right) == -1)
10774 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010775 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010777 PyErr_Format(PyExc_TypeError,
10778 "Can't compare %.100s and %.100s",
10779 left->ob_type->tp_name,
10780 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 return -1;
10782}
10783
Martin v. Löwis5b222132007-06-10 09:51:05 +000010784int
10785PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 Py_ssize_t i;
10788 int kind;
10789 void *data;
10790 Py_UCS4 chr;
10791
Victor Stinner910337b2011-10-03 03:20:16 +020010792 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 if (PyUnicode_READY(uni) == -1)
10794 return -1;
10795 kind = PyUnicode_KIND(uni);
10796 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010797 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10799 if (chr != str[i])
10800 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010801 /* This check keeps Python strings that end in '\0' from comparing equal
10802 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010804 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010805 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010806 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010807 return 0;
10808}
10809
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010810
Benjamin Peterson29060642009-01-31 22:14:21 +000010811#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010813
Alexander Belopolsky40018472011-02-26 01:02:56 +000010814PyObject *
10815PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010816{
10817 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010818
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010819 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10820 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 if (PyUnicode_READY(left) == -1 ||
10822 PyUnicode_READY(right) == -1)
10823 return NULL;
10824 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10825 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010826 if (op == Py_EQ) {
10827 Py_INCREF(Py_False);
10828 return Py_False;
10829 }
10830 if (op == Py_NE) {
10831 Py_INCREF(Py_True);
10832 return Py_True;
10833 }
10834 }
10835 if (left == right)
10836 result = 0;
10837 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010838 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010839
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010840 /* Convert the return value to a Boolean */
10841 switch (op) {
10842 case Py_EQ:
10843 v = TEST_COND(result == 0);
10844 break;
10845 case Py_NE:
10846 v = TEST_COND(result != 0);
10847 break;
10848 case Py_LE:
10849 v = TEST_COND(result <= 0);
10850 break;
10851 case Py_GE:
10852 v = TEST_COND(result >= 0);
10853 break;
10854 case Py_LT:
10855 v = TEST_COND(result == -1);
10856 break;
10857 case Py_GT:
10858 v = TEST_COND(result == 1);
10859 break;
10860 default:
10861 PyErr_BadArgument();
10862 return NULL;
10863 }
10864 Py_INCREF(v);
10865 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010866 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010867
Brian Curtindfc80e32011-08-10 20:28:54 -050010868 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010869}
10870
Alexander Belopolsky40018472011-02-26 01:02:56 +000010871int
10872PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010873{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010874 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 int kind1, kind2, kind;
10876 void *buf1, *buf2;
10877 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010878 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010879
10880 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010881 sub = PyUnicode_FromObject(element);
10882 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 PyErr_Format(PyExc_TypeError,
10884 "'in <string>' requires string as left operand, not %s",
10885 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010886 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010887 }
10888
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010890 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010891 Py_DECREF(sub);
10892 return -1;
10893 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010894 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10895 Py_DECREF(sub);
10896 Py_DECREF(str);
10897 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 kind1 = PyUnicode_KIND(str);
10900 kind2 = PyUnicode_KIND(sub);
10901 kind = kind1 > kind2 ? kind1 : kind2;
10902 buf1 = PyUnicode_DATA(str);
10903 buf2 = PyUnicode_DATA(sub);
10904 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010905 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 if (!buf1) {
10907 Py_DECREF(sub);
10908 return -1;
10909 }
10910 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010911 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 if (!buf2) {
10913 Py_DECREF(sub);
10914 if (kind1 != kind) PyMem_Free(buf1);
10915 return -1;
10916 }
10917 len1 = PyUnicode_GET_LENGTH(str);
10918 len2 = PyUnicode_GET_LENGTH(sub);
10919
Benjamin Petersonead6b532011-12-20 17:23:42 -060010920 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 case PyUnicode_1BYTE_KIND:
10922 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10923 break;
10924 case PyUnicode_2BYTE_KIND:
10925 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10926 break;
10927 case PyUnicode_4BYTE_KIND:
10928 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10929 break;
10930 default:
10931 result = -1;
10932 assert(0);
10933 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934
10935 Py_DECREF(str);
10936 Py_DECREF(sub);
10937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (kind1 != kind)
10939 PyMem_Free(buf1);
10940 if (kind2 != kind)
10941 PyMem_Free(buf2);
10942
Guido van Rossum403d68b2000-03-13 15:55:09 +000010943 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010944}
10945
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946/* Concat to string or Unicode object giving a new Unicode object. */
10947
Alexander Belopolsky40018472011-02-26 01:02:56 +000010948PyObject *
10949PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010952 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010953 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
10955 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010958 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010961 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010964 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010965 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010968 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 }
10972
Victor Stinner488fa492011-12-12 00:01:39 +010010973 u_len = PyUnicode_GET_LENGTH(u);
10974 v_len = PyUnicode_GET_LENGTH(v);
10975 if (u_len > PY_SSIZE_T_MAX - v_len) {
10976 PyErr_SetString(PyExc_OverflowError,
10977 "strings are too large to concat");
10978 goto onError;
10979 }
10980 new_len = u_len + v_len;
10981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010983 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10984 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010987 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010990 copy_characters(w, 0, u, 0, u_len);
10991 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 Py_DECREF(u);
10993 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010994 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 Py_XDECREF(u);
10999 Py_XDECREF(v);
11000 return NULL;
11001}
11002
Walter Dörwald1ab83302007-05-18 17:15:44 +000011003void
Victor Stinner23e56682011-10-03 03:54:37 +020011004PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011005{
Victor Stinner23e56682011-10-03 03:54:37 +020011006 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011007 Py_UCS4 maxchar, maxchar2;
11008 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011009
11010 if (p_left == NULL) {
11011 if (!PyErr_Occurred())
11012 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011013 return;
11014 }
Victor Stinner23e56682011-10-03 03:54:37 +020011015 left = *p_left;
11016 if (right == NULL || !PyUnicode_Check(left)) {
11017 if (!PyErr_Occurred())
11018 PyErr_BadInternalCall();
11019 goto error;
11020 }
11021
Benjamin Petersonbac79492012-01-14 13:34:47 -050011022 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011023 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011024 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011025 goto error;
11026
Victor Stinner488fa492011-12-12 00:01:39 +010011027 /* Shortcuts */
11028 if (left == unicode_empty) {
11029 Py_DECREF(left);
11030 Py_INCREF(right);
11031 *p_left = right;
11032 return;
11033 }
11034 if (right == unicode_empty)
11035 return;
11036
11037 left_len = PyUnicode_GET_LENGTH(left);
11038 right_len = PyUnicode_GET_LENGTH(right);
11039 if (left_len > PY_SSIZE_T_MAX - right_len) {
11040 PyErr_SetString(PyExc_OverflowError,
11041 "strings are too large to concat");
11042 goto error;
11043 }
11044 new_len = left_len + right_len;
11045
11046 if (unicode_modifiable(left)
11047 && PyUnicode_CheckExact(right)
11048 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011049 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11050 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011051 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011052 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011053 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11054 {
11055 /* append inplace */
11056 if (unicode_resize(p_left, new_len) != 0) {
11057 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11058 * deallocated so it cannot be put back into
11059 * 'variable'. The MemoryError is raised when there
11060 * is no value in 'variable', which might (very
11061 * remotely) be a cause of incompatibilities.
11062 */
11063 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011064 }
Victor Stinner488fa492011-12-12 00:01:39 +010011065 /* copy 'right' into the newly allocated area of 'left' */
11066 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011067 }
Victor Stinner488fa492011-12-12 00:01:39 +010011068 else {
11069 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11070 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11071 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011072
Victor Stinner488fa492011-12-12 00:01:39 +010011073 /* Concat the two Unicode strings */
11074 res = PyUnicode_New(new_len, maxchar);
11075 if (res == NULL)
11076 goto error;
11077 copy_characters(res, 0, left, 0, left_len);
11078 copy_characters(res, left_len, right, 0, right_len);
11079 Py_DECREF(left);
11080 *p_left = res;
11081 }
11082 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011083 return;
11084
11085error:
Victor Stinner488fa492011-12-12 00:01:39 +010011086 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011087}
11088
11089void
11090PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11091{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011092 PyUnicode_Append(pleft, right);
11093 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011094}
11095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011096PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011099Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011100string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102
11103static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011104unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011106 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011107 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011108 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 int kind1, kind2, kind;
11111 void *buf1, *buf2;
11112 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113
Jesus Ceaac451502011-04-20 17:09:23 +020011114 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11115 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 kind1 = PyUnicode_KIND(self);
11119 kind2 = PyUnicode_KIND(substring);
11120 kind = kind1 > kind2 ? kind1 : kind2;
11121 buf1 = PyUnicode_DATA(self);
11122 buf2 = PyUnicode_DATA(substring);
11123 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011124 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 if (!buf1) {
11126 Py_DECREF(substring);
11127 return NULL;
11128 }
11129 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011130 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 if (!buf2) {
11132 Py_DECREF(substring);
11133 if (kind1 != kind) PyMem_Free(buf1);
11134 return NULL;
11135 }
11136 len1 = PyUnicode_GET_LENGTH(self);
11137 len2 = PyUnicode_GET_LENGTH(substring);
11138
11139 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011140 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 case PyUnicode_1BYTE_KIND:
11142 iresult = ucs1lib_count(
11143 ((Py_UCS1*)buf1) + start, end - start,
11144 buf2, len2, PY_SSIZE_T_MAX
11145 );
11146 break;
11147 case PyUnicode_2BYTE_KIND:
11148 iresult = ucs2lib_count(
11149 ((Py_UCS2*)buf1) + start, end - start,
11150 buf2, len2, PY_SSIZE_T_MAX
11151 );
11152 break;
11153 case PyUnicode_4BYTE_KIND:
11154 iresult = ucs4lib_count(
11155 ((Py_UCS4*)buf1) + start, end - start,
11156 buf2, len2, PY_SSIZE_T_MAX
11157 );
11158 break;
11159 default:
11160 assert(0); iresult = 0;
11161 }
11162
11163 result = PyLong_FromSsize_t(iresult);
11164
11165 if (kind1 != kind)
11166 PyMem_Free(buf1);
11167 if (kind2 != kind)
11168 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
11170 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 return result;
11173}
11174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011175PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011176 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011178Encode S using the codec registered for encoding. Default encoding\n\
11179is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011180handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011181a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11182'xmlcharrefreplace' as well as any other name registered with\n\
11183codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
11185static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011186unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011188 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189 char *encoding = NULL;
11190 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011191
Benjamin Peterson308d6372009-09-18 21:42:35 +000011192 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11193 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011195 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011196}
11197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011198PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200\n\
11201Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011202If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
11204static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011205unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011207 Py_ssize_t i, j, line_pos, src_len, incr;
11208 Py_UCS4 ch;
11209 PyObject *u;
11210 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011212 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011213 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
11215 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217
Antoine Pitrou22425222011-10-04 19:10:51 +020011218 if (PyUnicode_READY(self) == -1)
11219 return NULL;
11220
Thomas Wouters7e474022000-07-16 12:04:32 +000011221 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011222 src_len = PyUnicode_GET_LENGTH(self);
11223 i = j = line_pos = 0;
11224 kind = PyUnicode_KIND(self);
11225 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011226 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011227 for (; i < src_len; i++) {
11228 ch = PyUnicode_READ(kind, src_data, i);
11229 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011230 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011232 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011234 goto overflow;
11235 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011237 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011241 goto overflow;
11242 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011244 if (ch == '\n' || ch == '\r')
11245 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011247 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011248 if (!found)
11249 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011250
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 if (!u)
11254 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011255 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 for (; i < src_len; i++) {
11260 ch = PyUnicode_READ(kind, src_data, i);
11261 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011263 incr = tabsize - (line_pos % tabsize);
11264 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011265 FILL(kind, dest_data, ' ', j, incr);
11266 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011268 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 line_pos++;
11271 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011272 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011273 if (ch == '\n' || ch == '\r')
11274 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011276 }
11277 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011278 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011279
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011281 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283}
11284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011285PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287\n\
11288Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011289such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290arguments start and end are interpreted as in slice notation.\n\
11291\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011292Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
11294static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011297 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011298 Py_ssize_t start;
11299 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011300 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Jesus Ceaac451502011-04-20 17:09:23 +020011302 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11303 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (PyUnicode_READY(self) == -1)
11307 return NULL;
11308 if (PyUnicode_READY(substring) == -1)
11309 return NULL;
11310
Victor Stinner7931d9a2011-11-04 00:22:48 +010011311 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312
11313 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 if (result == -2)
11316 return NULL;
11317
Christian Heimes217cfd12007-12-02 14:31:20 +000011318 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319}
11320
11321static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011322unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011324 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11325 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328}
11329
Guido van Rossumc2504932007-09-18 19:42:40 +000011330/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011331 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011332static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011333unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334{
Guido van Rossumc2504932007-09-18 19:42:40 +000011335 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011336 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011337
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011338#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011339 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011340#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 if (_PyUnicode_HASH(self) != -1)
11342 return _PyUnicode_HASH(self);
11343 if (PyUnicode_READY(self) == -1)
11344 return -1;
11345 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011346 /*
11347 We make the hash of the empty string be 0, rather than using
11348 (prefix ^ suffix), since this slightly obfuscates the hash secret
11349 */
11350 if (len == 0) {
11351 _PyUnicode_HASH(self) = 0;
11352 return 0;
11353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354
11355 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011356#define HASH(P) \
11357 x ^= (Py_uhash_t) *P << 7; \
11358 while (--len >= 0) \
11359 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360
Georg Brandl2fb477c2012-02-21 00:33:36 +010011361 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 switch (PyUnicode_KIND(self)) {
11363 case PyUnicode_1BYTE_KIND: {
11364 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11365 HASH(c);
11366 break;
11367 }
11368 case PyUnicode_2BYTE_KIND: {
11369 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11370 HASH(s);
11371 break;
11372 }
11373 default: {
11374 Py_UCS4 *l;
11375 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11376 "Impossible switch case in unicode_hash");
11377 l = PyUnicode_4BYTE_DATA(self);
11378 HASH(l);
11379 break;
11380 }
11381 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011382 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11383 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384
Guido van Rossumc2504932007-09-18 19:42:40 +000011385 if (x == -1)
11386 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011388 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011392PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396
11397static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011400 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011401 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011402 Py_ssize_t start;
11403 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
Jesus Ceaac451502011-04-20 17:09:23 +020011405 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11406 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (PyUnicode_READY(self) == -1)
11410 return NULL;
11411 if (PyUnicode_READY(substring) == -1)
11412 return NULL;
11413
Victor Stinner7931d9a2011-11-04 00:22:48 +010011414 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
11416 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 if (result == -2)
11419 return NULL;
11420
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 if (result < 0) {
11422 PyErr_SetString(PyExc_ValueError, "substring not found");
11423 return NULL;
11424 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011425
Christian Heimes217cfd12007-12-02 14:31:20 +000011426 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011432Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011436unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 Py_ssize_t i, length;
11439 int kind;
11440 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 int cased;
11442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (PyUnicode_READY(self) == -1)
11444 return NULL;
11445 length = PyUnicode_GET_LENGTH(self);
11446 kind = PyUnicode_KIND(self);
11447 data = PyUnicode_DATA(self);
11448
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (length == 1)
11451 return PyBool_FromLong(
11452 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011454 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011457
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 for (i = 0; i < length; i++) {
11460 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011461
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11463 return PyBool_FromLong(0);
11464 else if (!cased && Py_UNICODE_ISLOWER(ch))
11465 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011467 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468}
11469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011470PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011473Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011474at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
11476static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011477unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 Py_ssize_t i, length;
11480 int kind;
11481 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482 int cased;
11483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (PyUnicode_READY(self) == -1)
11485 return NULL;
11486 length = PyUnicode_GET_LENGTH(self);
11487 kind = PyUnicode_KIND(self);
11488 data = PyUnicode_DATA(self);
11489
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 if (length == 1)
11492 return PyBool_FromLong(
11493 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011495 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011498
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 for (i = 0; i < length; i++) {
11501 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011502
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11504 return PyBool_FromLong(0);
11505 else if (!cased && Py_UNICODE_ISUPPER(ch))
11506 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011508 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509}
11510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011511PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011514Return True if S is a titlecased string and there is at least one\n\
11515character in S, i.e. upper- and titlecase characters may only\n\
11516follow uncased characters and lowercase characters only cased ones.\n\
11517Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
11519static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011520unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 Py_ssize_t i, length;
11523 int kind;
11524 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 int cased, previous_is_cased;
11526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (PyUnicode_READY(self) == -1)
11528 return NULL;
11529 length = PyUnicode_GET_LENGTH(self);
11530 kind = PyUnicode_KIND(self);
11531 data = PyUnicode_DATA(self);
11532
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 if (length == 1) {
11535 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11536 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11537 (Py_UNICODE_ISUPPER(ch) != 0));
11538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011540 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011543
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 cased = 0;
11545 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 for (i = 0; i < length; i++) {
11547 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011548
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11550 if (previous_is_cased)
11551 return PyBool_FromLong(0);
11552 previous_is_cased = 1;
11553 cased = 1;
11554 }
11555 else if (Py_UNICODE_ISLOWER(ch)) {
11556 if (!previous_is_cased)
11557 return PyBool_FromLong(0);
11558 previous_is_cased = 1;
11559 cased = 1;
11560 }
11561 else
11562 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011564 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565}
11566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011567PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011570Return True if all characters in S are whitespace\n\
11571and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572
11573static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011574unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 Py_ssize_t i, length;
11577 int kind;
11578 void *data;
11579
11580 if (PyUnicode_READY(self) == -1)
11581 return NULL;
11582 length = PyUnicode_GET_LENGTH(self);
11583 kind = PyUnicode_KIND(self);
11584 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 if (length == 1)
11588 return PyBool_FromLong(
11589 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011591 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 for (i = 0; i < length; i++) {
11596 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011597 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011600 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601}
11602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011605\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011606Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011607and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011608
11609static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011610unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 Py_ssize_t i, length;
11613 int kind;
11614 void *data;
11615
11616 if (PyUnicode_READY(self) == -1)
11617 return NULL;
11618 length = PyUnicode_GET_LENGTH(self);
11619 kind = PyUnicode_KIND(self);
11620 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011621
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011622 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 if (length == 1)
11624 return PyBool_FromLong(
11625 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011626
11627 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 for (i = 0; i < length; i++) {
11632 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011635 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011636}
11637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011638PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011640\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011641Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011642and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011643
11644static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011645unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 int kind;
11648 void *data;
11649 Py_ssize_t len, i;
11650
11651 if (PyUnicode_READY(self) == -1)
11652 return NULL;
11653
11654 kind = PyUnicode_KIND(self);
11655 data = PyUnicode_DATA(self);
11656 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011657
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011658 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 if (len == 1) {
11660 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11661 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11662 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011663
11664 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 for (i = 0; i < len; i++) {
11669 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011670 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011673 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011674}
11675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011679Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011680False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681
11682static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011683unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 Py_ssize_t i, length;
11686 int kind;
11687 void *data;
11688
11689 if (PyUnicode_READY(self) == -1)
11690 return NULL;
11691 length = PyUnicode_GET_LENGTH(self);
11692 kind = PyUnicode_KIND(self);
11693 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (length == 1)
11697 return PyBool_FromLong(
11698 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011700 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 for (i = 0; i < length; i++) {
11705 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011708 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709}
11710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011711PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011714Return True if all characters in S are digits\n\
11715and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716
11717static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011718unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 Py_ssize_t i, length;
11721 int kind;
11722 void *data;
11723
11724 if (PyUnicode_READY(self) == -1)
11725 return NULL;
11726 length = PyUnicode_GET_LENGTH(self);
11727 kind = PyUnicode_KIND(self);
11728 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 if (length == 1) {
11732 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11733 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011736 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 for (i = 0; i < length; i++) {
11741 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011744 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745}
11746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011747PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011750Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011751False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
11753static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011754unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 Py_ssize_t i, length;
11757 int kind;
11758 void *data;
11759
11760 if (PyUnicode_READY(self) == -1)
11761 return NULL;
11762 length = PyUnicode_GET_LENGTH(self);
11763 kind = PyUnicode_KIND(self);
11764 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 if (length == 1)
11768 return PyBool_FromLong(
11769 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011771 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 for (i = 0; i < length; i++) {
11776 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011779 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780}
11781
Martin v. Löwis47383402007-08-15 07:32:56 +000011782int
11783PyUnicode_IsIdentifier(PyObject *self)
11784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 int kind;
11786 void *data;
11787 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011788 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (PyUnicode_READY(self) == -1) {
11791 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 }
11794
11795 /* Special case for empty strings */
11796 if (PyUnicode_GET_LENGTH(self) == 0)
11797 return 0;
11798 kind = PyUnicode_KIND(self);
11799 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011800
11801 /* PEP 3131 says that the first character must be in
11802 XID_Start and subsequent characters in XID_Continue,
11803 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011805 letters, digits, underscore). However, given the current
11806 definition of XID_Start and XID_Continue, it is sufficient
11807 to check just for these, except that _ must be allowed
11808 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011810 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011811 return 0;
11812
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011813 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011816 return 1;
11817}
11818
11819PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011821\n\
11822Return True if S is a valid identifier according\n\
11823to the language definition.");
11824
11825static PyObject*
11826unicode_isidentifier(PyObject *self)
11827{
11828 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11829}
11830
Georg Brandl559e5d72008-06-11 18:37:52 +000011831PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011833\n\
11834Return True if all characters in S are considered\n\
11835printable in repr() or S is empty, False otherwise.");
11836
11837static PyObject*
11838unicode_isprintable(PyObject *self)
11839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 Py_ssize_t i, length;
11841 int kind;
11842 void *data;
11843
11844 if (PyUnicode_READY(self) == -1)
11845 return NULL;
11846 length = PyUnicode_GET_LENGTH(self);
11847 kind = PyUnicode_KIND(self);
11848 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011849
11850 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (length == 1)
11852 return PyBool_FromLong(
11853 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 for (i = 0; i < length; i++) {
11856 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011857 Py_RETURN_FALSE;
11858 }
11859 }
11860 Py_RETURN_TRUE;
11861}
11862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011863PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011864 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865\n\
11866Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011867iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868
11869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011870unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011872 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873}
11874
Martin v. Löwis18e16552006-02-15 17:27:45 +000011875static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011876unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (PyUnicode_READY(self) == -1)
11879 return -1;
11880 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881}
11882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011883PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011886Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011887done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
11889static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011890unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011892 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 Py_UCS4 fillchar = ' ';
11894
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011895 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 return NULL;
11897
Benjamin Petersonbac79492012-01-14 13:34:47 -050011898 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
Victor Stinnerc4b49542011-12-11 22:44:26 +010011901 if (PyUnicode_GET_LENGTH(self) >= width)
11902 return unicode_result_unchanged(self);
11903
11904 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905}
11906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011907PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011910Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
11912static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011913unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011915 if (PyUnicode_READY(self) == -1)
11916 return NULL;
11917 if (PyUnicode_IS_ASCII(self))
11918 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011919 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920}
11921
/* Selectors naming which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11930
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011931/* externally visible for str.strip(unicode) */
11932PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011933_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 void *data;
11936 int kind;
11937 Py_ssize_t i, j, len;
11938 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11941 return NULL;
11942
11943 kind = PyUnicode_KIND(self);
11944 data = PyUnicode_DATA(self);
11945 len = PyUnicode_GET_LENGTH(self);
11946 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11947 PyUnicode_DATA(sepobj),
11948 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011949
Benjamin Peterson14339b62009-01-31 16:36:08 +000011950 i = 0;
11951 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 while (i < len &&
11953 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 i++;
11955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011956 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011957
Benjamin Peterson14339b62009-01-31 16:36:08 +000011958 j = len;
11959 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 do {
11961 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 } while (j >= i &&
11963 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011965 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011966
Victor Stinner7931d9a2011-11-04 00:22:48 +010011967 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968}
11969
11970PyObject*
11971PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11972{
11973 unsigned char *data;
11974 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011975 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976
Victor Stinnerde636f32011-10-01 03:55:54 +020011977 if (PyUnicode_READY(self) == -1)
11978 return NULL;
11979
11980 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11981
Victor Stinner12bab6d2011-10-01 01:53:49 +020011982 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011983 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984
Victor Stinner12bab6d2011-10-01 01:53:49 +020011985 length = end - start;
11986 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011987 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988
Victor Stinnerde636f32011-10-01 03:55:54 +020011989 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011990 PyErr_SetString(PyExc_IndexError, "string index out of range");
11991 return NULL;
11992 }
11993
Victor Stinnerb9275c12011-10-05 14:01:42 +020011994 if (PyUnicode_IS_ASCII(self)) {
11995 kind = PyUnicode_KIND(self);
11996 data = PyUnicode_1BYTE_DATA(self);
11997 return unicode_fromascii(data + start, length);
11998 }
11999 else {
12000 kind = PyUnicode_KIND(self);
12001 data = PyUnicode_1BYTE_DATA(self);
12002 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012003 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012004 length);
12005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
12008static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012009do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 int kind;
12012 void *data;
12013 Py_ssize_t len, i, j;
12014
12015 if (PyUnicode_READY(self) == -1)
12016 return NULL;
12017
12018 kind = PyUnicode_KIND(self);
12019 data = PyUnicode_DATA(self);
12020 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012021
Benjamin Peterson14339b62009-01-31 16:36:08 +000012022 i = 0;
12023 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012025 i++;
12026 }
12027 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012028
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029 j = len;
12030 if (striptype != LEFTSTRIP) {
12031 do {
12032 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012034 j++;
12035 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012036
Victor Stinner7931d9a2011-11-04 00:22:48 +010012037 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038}
12039
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012040
12041static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012042do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012043{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012045
Benjamin Peterson14339b62009-01-31 16:36:08 +000012046 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12047 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012048
Benjamin Peterson14339b62009-01-31 16:36:08 +000012049 if (sep != NULL && sep != Py_None) {
12050 if (PyUnicode_Check(sep))
12051 return _PyUnicode_XStrip(self, striptype, sep);
12052 else {
12053 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 "%s arg must be None or str",
12055 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012056 return NULL;
12057 }
12058 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012059
Benjamin Peterson14339b62009-01-31 16:36:08 +000012060 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012061}
12062
12063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012064PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012066\n\
12067Return a copy of the string S with leading and trailing\n\
12068whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012069If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012070
12071static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012072unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 if (PyTuple_GET_SIZE(args) == 0)
12075 return do_strip(self, BOTHSTRIP); /* Common case */
12076 else
12077 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078}
12079
12080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012081PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012083\n\
12084Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012085If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012086
12087static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012088unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012089{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 if (PyTuple_GET_SIZE(args) == 0)
12091 return do_strip(self, LEFTSTRIP); /* Common case */
12092 else
12093 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012094}
12095
12096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012097PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012099\n\
12100Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012101If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012102
12103static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012104unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012106 if (PyTuple_GET_SIZE(args) == 0)
12107 return do_strip(self, RIGHTSTRIP); /* Common case */
12108 else
12109 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012110}
12111
12112
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012114unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012116 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
Georg Brandl222de0f2009-04-12 12:01:50 +000012119 if (len < 1) {
12120 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012121 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
Victor Stinnerc4b49542011-12-11 22:44:26 +010012124 /* no repeat, return original string */
12125 if (len == 1)
12126 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012127
Benjamin Petersonbac79492012-01-14 13:34:47 -050012128 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 return NULL;
12130
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012131 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012132 PyErr_SetString(PyExc_OverflowError,
12133 "repeated string is too long");
12134 return NULL;
12135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012137
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012138 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139 if (!u)
12140 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012141 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (PyUnicode_GET_LENGTH(str) == 1) {
12144 const int kind = PyUnicode_KIND(str);
12145 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012146 if (kind == PyUnicode_1BYTE_KIND) {
12147 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012148 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012149 }
12150 else if (kind == PyUnicode_2BYTE_KIND) {
12151 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012152 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012153 ucs2[n] = fill_char;
12154 } else {
12155 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12156 assert(kind == PyUnicode_4BYTE_KIND);
12157 for (n = 0; n < len; ++n)
12158 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 }
12161 else {
12162 /* number of characters copied this far */
12163 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012164 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 char *to = (char *) PyUnicode_DATA(u);
12166 Py_MEMCPY(to, PyUnicode_DATA(str),
12167 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012168 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 n = (done <= nchars-done) ? done : nchars-done;
12170 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012171 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 }
12174
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012175 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177}
12178
Alexander Belopolsky40018472011-02-26 01:02:56 +000012179PyObject *
12180PyUnicode_Replace(PyObject *obj,
12181 PyObject *subobj,
12182 PyObject *replobj,
12183 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184{
12185 PyObject *self;
12186 PyObject *str1;
12187 PyObject *str2;
12188 PyObject *result;
12189
12190 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012191 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012194 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 Py_DECREF(self);
12196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197 }
12198 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012199 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 Py_DECREF(self);
12201 Py_DECREF(str1);
12202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012204 if (PyUnicode_READY(self) == -1 ||
12205 PyUnicode_READY(str1) == -1 ||
12206 PyUnicode_READY(str2) == -1)
12207 result = NULL;
12208 else
12209 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 Py_DECREF(self);
12211 Py_DECREF(str1);
12212 Py_DECREF(str2);
12213 return result;
12214}
12215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012216PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012217 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218\n\
12219Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012220old replaced by new. If the optional argument count is\n\
12221given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222
12223static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 PyObject *str1;
12227 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012228 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229 PyObject *result;
12230
Martin v. Löwis18e16552006-02-15 17:27:45 +000012231 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012233 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012236 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 return NULL;
12238 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012239 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012240 Py_DECREF(str1);
12241 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012242 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012243 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12244 result = NULL;
12245 else
12246 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247
12248 Py_DECREF(str1);
12249 Py_DECREF(str2);
12250 return result;
12251}
12252
Alexander Belopolsky40018472011-02-26 01:02:56 +000012253static PyObject *
12254unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012256 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 Py_ssize_t isize;
12258 Py_ssize_t osize, squote, dquote, i, o;
12259 Py_UCS4 max, quote;
12260 int ikind, okind;
12261 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012264 return NULL;
12265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 isize = PyUnicode_GET_LENGTH(unicode);
12267 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 /* Compute length of output, quote characters, and
12270 maximum character */
12271 osize = 2; /* quotes */
12272 max = 127;
12273 squote = dquote = 0;
12274 ikind = PyUnicode_KIND(unicode);
12275 for (i = 0; i < isize; i++) {
12276 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12277 switch (ch) {
12278 case '\'': squote++; osize++; break;
12279 case '"': dquote++; osize++; break;
12280 case '\\': case '\t': case '\r': case '\n':
12281 osize += 2; break;
12282 default:
12283 /* Fast-path ASCII */
12284 if (ch < ' ' || ch == 0x7f)
12285 osize += 4; /* \xHH */
12286 else if (ch < 0x7f)
12287 osize++;
12288 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12289 osize++;
12290 max = ch > max ? ch : max;
12291 }
12292 else if (ch < 0x100)
12293 osize += 4; /* \xHH */
12294 else if (ch < 0x10000)
12295 osize += 6; /* \uHHHH */
12296 else
12297 osize += 10; /* \uHHHHHHHH */
12298 }
12299 }
12300
12301 quote = '\'';
12302 if (squote) {
12303 if (dquote)
12304 /* Both squote and dquote present. Use squote,
12305 and escape them */
12306 osize += squote;
12307 else
12308 quote = '"';
12309 }
12310
12311 repr = PyUnicode_New(osize, max);
12312 if (repr == NULL)
12313 return NULL;
12314 okind = PyUnicode_KIND(repr);
12315 odata = PyUnicode_DATA(repr);
12316
12317 PyUnicode_WRITE(okind, odata, 0, quote);
12318 PyUnicode_WRITE(okind, odata, osize-1, quote);
12319
12320 for (i = 0, o = 1; i < isize; i++) {
12321 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012322
12323 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if ((ch == quote) || (ch == '\\')) {
12325 PyUnicode_WRITE(okind, odata, o++, '\\');
12326 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012327 continue;
12328 }
12329
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012331 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 PyUnicode_WRITE(okind, odata, o++, '\\');
12333 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012334 }
12335 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 PyUnicode_WRITE(okind, odata, o++, '\\');
12337 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012338 }
12339 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 PyUnicode_WRITE(okind, odata, o++, '\\');
12341 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012342 }
12343
12344 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012345 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 PyUnicode_WRITE(okind, odata, o++, '\\');
12347 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012348 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12349 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012350 }
12351
Georg Brandl559e5d72008-06-11 18:37:52 +000012352 /* Copy ASCII characters as-is */
12353 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012355 }
12356
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012358 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012359 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012360 (categories Z* and C* except ASCII space)
12361 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012363 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 if (ch <= 0xff) {
12365 PyUnicode_WRITE(okind, odata, o++, '\\');
12366 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012367 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12368 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012369 }
12370 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 else if (ch >= 0x10000) {
12372 PyUnicode_WRITE(okind, odata, o++, '\\');
12373 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012374 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12375 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12376 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12377 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12378 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12379 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12380 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12381 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012382 }
12383 /* Map 16-bit characters to '\uxxxx' */
12384 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 PyUnicode_WRITE(okind, odata, o++, '\\');
12386 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012387 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12388 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12389 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12390 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012391 }
12392 }
12393 /* Copy characters as-is */
12394 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012396 }
12397 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012400 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012401 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402}
12403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012404PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406\n\
12407Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012408such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409arguments start and end are interpreted as in slice notation.\n\
12410\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012411Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412
12413static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012416 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012417 Py_ssize_t start;
12418 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420
Jesus Ceaac451502011-04-20 17:09:23 +020012421 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12422 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (PyUnicode_READY(self) == -1)
12426 return NULL;
12427 if (PyUnicode_READY(substring) == -1)
12428 return NULL;
12429
Victor Stinner7931d9a2011-11-04 00:22:48 +010012430 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431
12432 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 if (result == -2)
12435 return NULL;
12436
Christian Heimes217cfd12007-12-02 14:31:20 +000012437 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438}
12439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012440PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012443Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444
12445static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012448 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012449 Py_ssize_t start;
12450 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452
Jesus Ceaac451502011-04-20 17:09:23 +020012453 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12454 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (PyUnicode_READY(self) == -1)
12458 return NULL;
12459 if (PyUnicode_READY(substring) == -1)
12460 return NULL;
12461
Victor Stinner7931d9a2011-11-04 00:22:48 +010012462 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463
12464 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 if (result == -2)
12467 return NULL;
12468
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469 if (result < 0) {
12470 PyErr_SetString(PyExc_ValueError, "substring not found");
12471 return NULL;
12472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473
Christian Heimes217cfd12007-12-02 14:31:20 +000012474 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475}
12476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012477PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012480Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012481done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482
12483static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012484unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012486 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 Py_UCS4 fillchar = ' ';
12488
Victor Stinnere9a29352011-10-01 02:14:59 +020012489 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012491
Benjamin Petersonbac79492012-01-14 13:34:47 -050012492 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493 return NULL;
12494
Victor Stinnerc4b49542011-12-11 22:44:26 +010012495 if (PyUnicode_GET_LENGTH(self) >= width)
12496 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
Victor Stinnerc4b49542011-12-11 22:44:26 +010012498 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499}
12500
Alexander Belopolsky40018472011-02-26 01:02:56 +000012501PyObject *
12502PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
12504 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012505
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506 s = PyUnicode_FromObject(s);
12507 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012509 if (sep != NULL) {
12510 sep = PyUnicode_FromObject(sep);
12511 if (sep == NULL) {
12512 Py_DECREF(s);
12513 return NULL;
12514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515 }
12516
Victor Stinner9310abb2011-10-05 00:59:23 +020012517 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518
12519 Py_DECREF(s);
12520 Py_XDECREF(sep);
12521 return result;
12522}
12523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012524PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012525 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526\n\
12527Return a list of the words in S, using sep as the\n\
12528delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012529splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012530whitespace string is a separator and empty strings are\n\
12531removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
12533static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012534unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012536 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012538 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012540 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12541 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542 return NULL;
12543
12544 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012547 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012549 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550}
12551
Thomas Wouters477c8d52006-05-27 19:21:47 +000012552PyObject *
12553PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12554{
12555 PyObject* str_obj;
12556 PyObject* sep_obj;
12557 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 int kind1, kind2, kind;
12559 void *buf1 = NULL, *buf2 = NULL;
12560 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561
12562 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012563 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012565 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012566 if (!sep_obj) {
12567 Py_DECREF(str_obj);
12568 return NULL;
12569 }
12570 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12571 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572 Py_DECREF(str_obj);
12573 return NULL;
12574 }
12575
Victor Stinner14f8f022011-10-05 20:58:25 +020012576 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012578 kind = Py_MAX(kind1, kind2);
12579 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012581 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 if (!buf1)
12583 goto onError;
12584 buf2 = PyUnicode_DATA(sep_obj);
12585 if (kind2 != kind)
12586 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12587 if (!buf2)
12588 goto onError;
12589 len1 = PyUnicode_GET_LENGTH(str_obj);
12590 len2 = PyUnicode_GET_LENGTH(sep_obj);
12591
Benjamin Petersonead6b532011-12-20 17:23:42 -060012592 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012594 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12595 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12596 else
12597 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 break;
12599 case PyUnicode_2BYTE_KIND:
12600 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12601 break;
12602 case PyUnicode_4BYTE_KIND:
12603 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12604 break;
12605 default:
12606 assert(0);
12607 out = 0;
12608 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012609
12610 Py_DECREF(sep_obj);
12611 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 if (kind1 != kind)
12613 PyMem_Free(buf1);
12614 if (kind2 != kind)
12615 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012616
12617 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 onError:
12619 Py_DECREF(sep_obj);
12620 Py_DECREF(str_obj);
12621 if (kind1 != kind && buf1)
12622 PyMem_Free(buf1);
12623 if (kind2 != kind && buf2)
12624 PyMem_Free(buf2);
12625 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012626}
12627
12628
12629PyObject *
12630PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12631{
12632 PyObject* str_obj;
12633 PyObject* sep_obj;
12634 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 int kind1, kind2, kind;
12636 void *buf1 = NULL, *buf2 = NULL;
12637 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012638
12639 str_obj = PyUnicode_FromObject(str_in);
12640 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012642 sep_obj = PyUnicode_FromObject(sep_in);
12643 if (!sep_obj) {
12644 Py_DECREF(str_obj);
12645 return NULL;
12646 }
12647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 kind1 = PyUnicode_KIND(str_in);
12649 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012650 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 buf1 = PyUnicode_DATA(str_in);
12652 if (kind1 != kind)
12653 buf1 = _PyUnicode_AsKind(str_in, kind);
12654 if (!buf1)
12655 goto onError;
12656 buf2 = PyUnicode_DATA(sep_obj);
12657 if (kind2 != kind)
12658 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12659 if (!buf2)
12660 goto onError;
12661 len1 = PyUnicode_GET_LENGTH(str_obj);
12662 len2 = PyUnicode_GET_LENGTH(sep_obj);
12663
Benjamin Petersonead6b532011-12-20 17:23:42 -060012664 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012666 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12667 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12668 else
12669 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 break;
12671 case PyUnicode_2BYTE_KIND:
12672 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12673 break;
12674 case PyUnicode_4BYTE_KIND:
12675 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12676 break;
12677 default:
12678 assert(0);
12679 out = 0;
12680 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012681
12682 Py_DECREF(sep_obj);
12683 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 if (kind1 != kind)
12685 PyMem_Free(buf1);
12686 if (kind2 != kind)
12687 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012688
12689 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 onError:
12691 Py_DECREF(sep_obj);
12692 Py_DECREF(str_obj);
12693 if (kind1 != kind && buf1)
12694 PyMem_Free(buf1);
12695 if (kind2 != kind && buf2)
12696 PyMem_Free(buf2);
12697 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012698}
12699
12700PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012702\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012703Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012704the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012705found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012706
12707static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012708unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012709{
Victor Stinner9310abb2011-10-05 00:59:23 +020012710 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711}
12712
12713PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012714 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012716Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012717the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012718separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012719
12720static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012721unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012722{
Victor Stinner9310abb2011-10-05 00:59:23 +020012723 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012724}
12725
Alexander Belopolsky40018472011-02-26 01:02:56 +000012726PyObject *
12727PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012728{
12729 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012731 s = PyUnicode_FromObject(s);
12732 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012733 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 if (sep != NULL) {
12735 sep = PyUnicode_FromObject(sep);
12736 if (sep == NULL) {
12737 Py_DECREF(s);
12738 return NULL;
12739 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012740 }
12741
Victor Stinner9310abb2011-10-05 00:59:23 +020012742 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012743
12744 Py_DECREF(s);
12745 Py_XDECREF(sep);
12746 return result;
12747}
12748
12749PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012750 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012751\n\
12752Return a list of the words in S, using sep as the\n\
12753delimiter string, starting at the end of the string and\n\
12754working to the front. If maxsplit is given, at most maxsplit\n\
12755splits are done. If sep is not specified, any whitespace string\n\
12756is a separator.");
12757
12758static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012759unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012760{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012761 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012762 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012763 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012764
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012765 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12766 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012767 return NULL;
12768
12769 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012771 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012772 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012773 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012774 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012775}
12776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012777PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779\n\
12780Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012781Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012782is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783
12784static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012785unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012787 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012788 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012790 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12791 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792 return NULL;
12793
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012794 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795}
12796
12797static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012798PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012800 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801}
12802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012803PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805\n\
12806Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012807and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
12809static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012810unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012812 if (PyUnicode_READY(self) == -1)
12813 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012814 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815}
12816
Georg Brandlceee0772007-11-27 23:48:05 +000012817PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012819\n\
12820Return a translation table usable for str.translate().\n\
12821If there is only one argument, it must be a dictionary mapping Unicode\n\
12822ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012823Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012824If there are two arguments, they must be strings of equal length, and\n\
12825in the resulting dictionary, each character in x will be mapped to the\n\
12826character at the same position in y. If there is a third argument, it\n\
12827must be a string, whose characters will be mapped to None in the result.");
12828
12829static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012830unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012831{
12832 PyObject *x, *y = NULL, *z = NULL;
12833 PyObject *new = NULL, *key, *value;
12834 Py_ssize_t i = 0;
12835 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012836
Georg Brandlceee0772007-11-27 23:48:05 +000012837 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12838 return NULL;
12839 new = PyDict_New();
12840 if (!new)
12841 return NULL;
12842 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 int x_kind, y_kind, z_kind;
12844 void *x_data, *y_data, *z_data;
12845
Georg Brandlceee0772007-11-27 23:48:05 +000012846 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012847 if (!PyUnicode_Check(x)) {
12848 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12849 "be a string if there is a second argument");
12850 goto err;
12851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012853 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12854 "arguments must have equal length");
12855 goto err;
12856 }
12857 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858 x_kind = PyUnicode_KIND(x);
12859 y_kind = PyUnicode_KIND(y);
12860 x_data = PyUnicode_DATA(x);
12861 y_data = PyUnicode_DATA(y);
12862 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12863 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012864 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012865 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012866 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012867 if (!value) {
12868 Py_DECREF(key);
12869 goto err;
12870 }
Georg Brandlceee0772007-11-27 23:48:05 +000012871 res = PyDict_SetItem(new, key, value);
12872 Py_DECREF(key);
12873 Py_DECREF(value);
12874 if (res < 0)
12875 goto err;
12876 }
12877 /* create entries for deleting chars in z */
12878 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 z_kind = PyUnicode_KIND(z);
12880 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012881 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012882 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012883 if (!key)
12884 goto err;
12885 res = PyDict_SetItem(new, key, Py_None);
12886 Py_DECREF(key);
12887 if (res < 0)
12888 goto err;
12889 }
12890 }
12891 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 int kind;
12893 void *data;
12894
Georg Brandlceee0772007-11-27 23:48:05 +000012895 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012896 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012897 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12898 "to maketrans it must be a dict");
12899 goto err;
12900 }
12901 /* copy entries into the new dict, converting string keys to int keys */
12902 while (PyDict_Next(x, &i, &key, &value)) {
12903 if (PyUnicode_Check(key)) {
12904 /* convert string keys to integer keys */
12905 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012906 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012907 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12908 "table must be of length 1");
12909 goto err;
12910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 kind = PyUnicode_KIND(key);
12912 data = PyUnicode_DATA(key);
12913 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012914 if (!newkey)
12915 goto err;
12916 res = PyDict_SetItem(new, newkey, value);
12917 Py_DECREF(newkey);
12918 if (res < 0)
12919 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012920 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012921 /* just keep integer keys */
12922 if (PyDict_SetItem(new, key, value) < 0)
12923 goto err;
12924 } else {
12925 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12926 "be strings or integers");
12927 goto err;
12928 }
12929 }
12930 }
12931 return new;
12932 err:
12933 Py_DECREF(new);
12934 return NULL;
12935}
12936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012937PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939\n\
12940Return a copy of the string S, where all characters have been mapped\n\
12941through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012942Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012943Unmapped characters are left untouched. Characters mapped to None\n\
12944are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945
12946static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950}
12951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012952PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012953 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012955Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956
12957static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012958unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012960 if (PyUnicode_READY(self) == -1)
12961 return NULL;
12962 if (PyUnicode_IS_ASCII(self))
12963 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012964 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965}
12966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012967PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012970Pad a numeric string S with zeros on the left, to fill a field\n\
12971of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972
12973static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012974unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012975{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012976 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012977 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012978 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 int kind;
12980 void *data;
12981 Py_UCS4 chr;
12982
Martin v. Löwis18e16552006-02-15 17:27:45 +000012983 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984 return NULL;
12985
Benjamin Petersonbac79492012-01-14 13:34:47 -050012986 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988
Victor Stinnerc4b49542011-12-11 22:44:26 +010012989 if (PyUnicode_GET_LENGTH(self) >= width)
12990 return unicode_result_unchanged(self);
12991
12992 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993
12994 u = pad(self, fill, 0, '0');
12995
Walter Dörwald068325e2002-04-15 13:36:47 +000012996 if (u == NULL)
12997 return NULL;
12998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 kind = PyUnicode_KIND(u);
13000 data = PyUnicode_DATA(u);
13001 chr = PyUnicode_READ(kind, data, fill);
13002
13003 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 PyUnicode_WRITE(kind, data, 0, chr);
13006 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007 }
13008
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013009 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013010 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012
#if 0
/* Compiled out.  Would forward to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
13020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013021PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013024Return True if S starts with the specified prefix, False otherwise.\n\
13025With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013026With optional end, stop comparing S at that position.\n\
13027prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028
13029static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013030unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013031 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013033 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013034 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013035 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013036 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013037 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038
Jesus Ceaac451502011-04-20 17:09:23 +020013039 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013041 if (PyTuple_Check(subobj)) {
13042 Py_ssize_t i;
13043 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013044 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013045 if (substring == NULL)
13046 return NULL;
13047 result = tailmatch(self, substring, start, end, -1);
13048 Py_DECREF(substring);
13049 if (result) {
13050 Py_RETURN_TRUE;
13051 }
13052 }
13053 /* nothing matched */
13054 Py_RETURN_FALSE;
13055 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013056 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013057 if (substring == NULL) {
13058 if (PyErr_ExceptionMatches(PyExc_TypeError))
13059 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13060 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013062 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013063 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013065 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066}
13067
13068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013072Return True if S ends with the specified suffix, False otherwise.\n\
13073With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013074With optional end, stop comparing S at that position.\n\
13075suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076
13077static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013078unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013081 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013082 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013083 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013084 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013085 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
Jesus Ceaac451502011-04-20 17:09:23 +020013087 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013089 if (PyTuple_Check(subobj)) {
13090 Py_ssize_t i;
13091 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013092 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013093 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013094 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013096 result = tailmatch(self, substring, start, end, +1);
13097 Py_DECREF(substring);
13098 if (result) {
13099 Py_RETURN_TRUE;
13100 }
13101 }
13102 Py_RETURN_FALSE;
13103 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013104 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013105 if (substring == NULL) {
13106 if (PyErr_ExceptionMatches(PyExc_TypeError))
13107 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13108 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013110 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013111 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013113 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114}
13115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013117
13118PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013119 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013120\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013121Return a formatted version of S, using substitutions from args and kwargs.\n\
13122The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013123
Eric Smith27bbca62010-11-04 17:06:58 +000013124PyDoc_STRVAR(format_map__doc__,
13125 "S.format_map(mapping) -> str\n\
13126\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013127Return a formatted version of S, using substitutions from mapping.\n\
13128The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013129
Eric Smith4a7d76d2008-05-30 18:10:19 +000013130static PyObject *
13131unicode__format__(PyObject* self, PyObject* args)
13132{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013133 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013134
13135 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13136 return NULL;
13137
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013138 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013140 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013141}
13142
Eric Smith8c663262007-08-25 02:26:07 +000013143PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013145\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013146Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013147
13148static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013149unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013150{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 Py_ssize_t size;
13152
13153 /* If it's a compact object, account for base structure +
13154 character data. */
13155 if (PyUnicode_IS_COMPACT_ASCII(v))
13156 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13157 else if (PyUnicode_IS_COMPACT(v))
13158 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013159 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 else {
13161 /* If it is a two-block object, account for base object, and
13162 for character block if present. */
13163 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013164 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013166 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 }
13168 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013169 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013170 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013172 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013173 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174
13175 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013176}
13177
13178PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013180
13181static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013182unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013183{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013184 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 if (!copy)
13186 return NULL;
13187 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013188}
13189
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013191 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013192 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013193 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13194 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013195 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13196 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013197 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013198 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13199 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13200 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13201 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13202 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013203 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013204 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13205 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13206 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013207 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013208 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13209 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13210 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013211 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013212 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013213 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013214 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013215 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13216 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13217 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13218 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13219 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13220 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13221 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13222 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13223 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13224 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13225 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13226 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13227 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13228 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013229 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013230 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013231 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013232 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013233 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013234 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013235 {"maketrans", (PyCFunction) unicode_maketrans,
13236 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013237 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013238#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013239 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013240 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241#endif
13242
Benjamin Peterson14339b62009-01-31 16:36:08 +000013243 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 {NULL, NULL}
13245};
13246
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013247static PyObject *
13248unicode_mod(PyObject *v, PyObject *w)
13249{
Brian Curtindfc80e32011-08-10 20:28:54 -050013250 if (!PyUnicode_Check(v))
13251 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013253}
13254
13255static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013256 0, /*nb_add*/
13257 0, /*nb_subtract*/
13258 0, /*nb_multiply*/
13259 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013260};
13261
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 (lenfunc) unicode_length, /* sq_length */
13264 PyUnicode_Concat, /* sq_concat */
13265 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13266 (ssizeargfunc) unicode_getitem, /* sq_item */
13267 0, /* sq_slice */
13268 0, /* sq_ass_item */
13269 0, /* sq_ass_slice */
13270 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271};
13272
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013273static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013274unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 if (PyUnicode_READY(self) == -1)
13277 return NULL;
13278
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013279 if (PyIndex_Check(item)) {
13280 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013281 if (i == -1 && PyErr_Occurred())
13282 return NULL;
13283 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013284 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013285 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013286 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013287 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013288 PyObject *result;
13289 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013290 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013291 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013295 return NULL;
13296 }
13297
13298 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013299 Py_INCREF(unicode_empty);
13300 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013302 slicelength == PyUnicode_GET_LENGTH(self)) {
13303 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013304 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013305 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013306 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013307 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013308 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013309 src_kind = PyUnicode_KIND(self);
13310 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013311 if (!PyUnicode_IS_ASCII(self)) {
13312 kind_limit = kind_maxchar_limit(src_kind);
13313 max_char = 0;
13314 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13315 ch = PyUnicode_READ(src_kind, src_data, cur);
13316 if (ch > max_char) {
13317 max_char = ch;
13318 if (max_char >= kind_limit)
13319 break;
13320 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013321 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013322 }
Victor Stinner55c99112011-10-13 01:17:06 +020013323 else
13324 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013325 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013326 if (result == NULL)
13327 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013328 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013329 dest_data = PyUnicode_DATA(result);
13330
13331 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013332 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13333 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013334 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013335 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013336 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013337 } else {
13338 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13339 return NULL;
13340 }
13341}
13342
13343static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013344 (lenfunc)unicode_length, /* mp_length */
13345 (binaryfunc)unicode_subscript, /* mp_subscript */
13346 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013347};
13348
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350/* Helpers for PyUnicode_Format() */
13351
13352static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013353getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013355 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 (*p_argidx)++;
13358 if (arglen < 0)
13359 return args;
13360 else
13361 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362 }
13363 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365 return NULL;
13366}
13367
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013368/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013370static PyObject *
13371formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013373 char *p;
13374 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013376
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377 x = PyFloat_AsDouble(v);
13378 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013379 return NULL;
13380
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013383
Eric Smith0923d1d2009-04-16 20:16:10 +000013384 p = PyOS_double_to_string(x, type, prec,
13385 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013386 if (p == NULL)
13387 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013388 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013389 PyMem_Free(p);
13390 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391}
13392
Tim Peters38fd5b62000-09-21 05:43:11 +000013393static PyObject*
13394formatlong(PyObject *val, int flags, int prec, int type)
13395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013396 char *buf;
13397 int len;
13398 PyObject *str; /* temporary string object. */
13399 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013400
Benjamin Peterson14339b62009-01-31 16:36:08 +000013401 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13402 if (!str)
13403 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013404 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013405 Py_DECREF(str);
13406 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013407}
13408
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013409static Py_UCS4
13410formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013412 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013413 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013414 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013415 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 goto onError;
13418 }
13419 else {
13420 /* Integer input truncated to a character */
13421 long x;
13422 x = PyLong_AsLong(v);
13423 if (x == -1 && PyErr_Occurred())
13424 goto onError;
13425
Victor Stinner8faf8212011-12-08 22:14:11 +010013426 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 PyErr_SetString(PyExc_OverflowError,
13428 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013429 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 }
13431
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013432 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013433 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013434
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013436 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013438 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439}
13440
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013441static int
13442repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13443{
13444 int r;
13445 assert(count > 0);
13446 assert(PyUnicode_Check(obj));
13447 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013448 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013449 if (repeated == NULL)
13450 return -1;
13451 r = _PyAccu_Accumulate(acc, repeated);
13452 Py_DECREF(repeated);
13453 return r;
13454 }
13455 else {
13456 do {
13457 if (_PyAccu_Accumulate(acc, obj))
13458 return -1;
13459 } while (--count);
13460 return 0;
13461 }
13462}
13463
Alexander Belopolsky40018472011-02-26 01:02:56 +000013464PyObject *
13465PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 void *fmt;
13468 int fmtkind;
13469 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013470 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013471 int r;
13472 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013473 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013474 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 PyObject *temp = NULL;
13476 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013477 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013478 _PyAccu acc;
13479 static PyObject *plus, *minus, *blank, *zero, *percent;
13480
13481 if (!plus && !(plus = get_latin1_char('+')))
13482 return NULL;
13483 if (!minus && !(minus = get_latin1_char('-')))
13484 return NULL;
13485 if (!blank && !(blank = get_latin1_char(' ')))
13486 return NULL;
13487 if (!zero && !(zero = get_latin1_char('0')))
13488 return NULL;
13489 if (!percent && !(percent = get_latin1_char('%')))
13490 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013491
Guido van Rossumd57fd912000-03-10 22:53:23 +000013492 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 PyErr_BadInternalCall();
13494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013495 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013496 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013497 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013499 if (PyUnicode_READY(uformat) == -1)
13500 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013501 if (_PyAccu_Init(&acc))
13502 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 fmt = PyUnicode_DATA(uformat);
13504 fmtkind = PyUnicode_KIND(uformat);
13505 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13506 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 arglen = PyTuple_Size(args);
13510 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013511 }
13512 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 arglen = -1;
13514 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013516 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013517 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013518 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519
13520 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013521 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 PyObject *nonfmt;
13523 Py_ssize_t nonfmtpos;
13524 nonfmtpos = fmtpos++;
13525 while (fmtcnt >= 0 &&
13526 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13527 fmtpos++;
13528 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013529 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013530 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013531 if (nonfmt == NULL)
13532 goto onError;
13533 r = _PyAccu_Accumulate(&acc, nonfmt);
13534 Py_DECREF(nonfmt);
13535 if (r)
13536 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013537 }
13538 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 /* Got a format specifier */
13540 int flags = 0;
13541 Py_ssize_t width = -1;
13542 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013544 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 int isnumok;
13546 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013547 void *pbuf = NULL;
13548 Py_ssize_t pindex, len;
13549 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013551 fmtpos++;
13552 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13553 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 Py_ssize_t keylen;
13555 PyObject *key;
13556 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013557
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 if (dict == NULL) {
13559 PyErr_SetString(PyExc_TypeError,
13560 "format requires a mapping");
13561 goto onError;
13562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013565 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 /* Skip over balanced parentheses */
13567 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013568 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013570 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013574 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 if (fmtcnt < 0 || pcount > 0) {
13576 PyErr_SetString(PyExc_ValueError,
13577 "incomplete format key");
13578 goto onError;
13579 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013580 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013581 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 if (key == NULL)
13583 goto onError;
13584 if (args_owned) {
13585 Py_DECREF(args);
13586 args_owned = 0;
13587 }
13588 args = PyObject_GetItem(dict, key);
13589 Py_DECREF(key);
13590 if (args == NULL) {
13591 goto onError;
13592 }
13593 args_owned = 1;
13594 arglen = -1;
13595 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013596 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 case '-': flags |= F_LJUST; continue;
13600 case '+': flags |= F_SIGN; continue;
13601 case ' ': flags |= F_BLANK; continue;
13602 case '#': flags |= F_ALT; continue;
13603 case '0': flags |= F_ZERO; continue;
13604 }
13605 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013606 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 if (c == '*') {
13608 v = getnextarg(args, arglen, &argidx);
13609 if (v == NULL)
13610 goto onError;
13611 if (!PyLong_Check(v)) {
13612 PyErr_SetString(PyExc_TypeError,
13613 "* wants int");
13614 goto onError;
13615 }
13616 width = PyLong_AsLong(v);
13617 if (width == -1 && PyErr_Occurred())
13618 goto onError;
13619 if (width < 0) {
13620 flags |= F_LJUST;
13621 width = -width;
13622 }
13623 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 }
13626 else if (c >= '0' && c <= '9') {
13627 width = c - '0';
13628 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013630 if (c < '0' || c > '9')
13631 break;
13632 if ((width*10) / 10 != width) {
13633 PyErr_SetString(PyExc_ValueError,
13634 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013635 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 }
13637 width = width*10 + (c - '0');
13638 }
13639 }
13640 if (c == '.') {
13641 prec = 0;
13642 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013643 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 if (c == '*') {
13645 v = getnextarg(args, arglen, &argidx);
13646 if (v == NULL)
13647 goto onError;
13648 if (!PyLong_Check(v)) {
13649 PyErr_SetString(PyExc_TypeError,
13650 "* wants int");
13651 goto onError;
13652 }
13653 prec = PyLong_AsLong(v);
13654 if (prec == -1 && PyErr_Occurred())
13655 goto onError;
13656 if (prec < 0)
13657 prec = 0;
13658 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013659 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 }
13661 else if (c >= '0' && c <= '9') {
13662 prec = c - '0';
13663 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013664 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 if (c < '0' || c > '9')
13666 break;
13667 if ((prec*10) / 10 != prec) {
13668 PyErr_SetString(PyExc_ValueError,
13669 "prec too big");
13670 goto onError;
13671 }
13672 prec = prec*10 + (c - '0');
13673 }
13674 }
13675 } /* prec */
13676 if (fmtcnt >= 0) {
13677 if (c == 'h' || c == 'l' || c == 'L') {
13678 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 }
13681 }
13682 if (fmtcnt < 0) {
13683 PyErr_SetString(PyExc_ValueError,
13684 "incomplete format");
13685 goto onError;
13686 }
13687 if (c != '%') {
13688 v = getnextarg(args, arglen, &argidx);
13689 if (v == NULL)
13690 goto onError;
13691 }
13692 sign = 0;
13693 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013694 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 switch (c) {
13696
13697 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013698 _PyAccu_Accumulate(&acc, percent);
13699 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013700
13701 case 's':
13702 case 'r':
13703 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013704 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 temp = v;
13706 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 }
13708 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 if (c == 's')
13710 temp = PyObject_Str(v);
13711 else if (c == 'r')
13712 temp = PyObject_Repr(v);
13713 else
13714 temp = PyObject_ASCII(v);
13715 if (temp == NULL)
13716 goto onError;
13717 if (PyUnicode_Check(temp))
13718 /* nothing to do */;
13719 else {
13720 Py_DECREF(temp);
13721 PyErr_SetString(PyExc_TypeError,
13722 "%s argument has non-string str()");
13723 goto onError;
13724 }
13725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013726 if (PyUnicode_READY(temp) == -1) {
13727 Py_CLEAR(temp);
13728 goto onError;
13729 }
13730 pbuf = PyUnicode_DATA(temp);
13731 kind = PyUnicode_KIND(temp);
13732 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 if (prec >= 0 && len > prec)
13734 len = prec;
13735 break;
13736
13737 case 'i':
13738 case 'd':
13739 case 'u':
13740 case 'o':
13741 case 'x':
13742 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 isnumok = 0;
13744 if (PyNumber_Check(v)) {
13745 PyObject *iobj=NULL;
13746
13747 if (PyLong_Check(v)) {
13748 iobj = v;
13749 Py_INCREF(iobj);
13750 }
13751 else {
13752 iobj = PyNumber_Long(v);
13753 }
13754 if (iobj!=NULL) {
13755 if (PyLong_Check(iobj)) {
13756 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013757 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 Py_DECREF(iobj);
13759 if (!temp)
13760 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013761 if (PyUnicode_READY(temp) == -1) {
13762 Py_CLEAR(temp);
13763 goto onError;
13764 }
13765 pbuf = PyUnicode_DATA(temp);
13766 kind = PyUnicode_KIND(temp);
13767 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 sign = 1;
13769 }
13770 else {
13771 Py_DECREF(iobj);
13772 }
13773 }
13774 }
13775 if (!isnumok) {
13776 PyErr_Format(PyExc_TypeError,
13777 "%%%c format: a number is required, "
13778 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13779 goto onError;
13780 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013781 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013783 fillobj = zero;
13784 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013785 break;
13786
13787 case 'e':
13788 case 'E':
13789 case 'f':
13790 case 'F':
13791 case 'g':
13792 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013793 temp = formatfloat(v, flags, prec, c);
13794 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 if (PyUnicode_READY(temp) == -1) {
13797 Py_CLEAR(temp);
13798 goto onError;
13799 }
13800 pbuf = PyUnicode_DATA(temp);
13801 kind = PyUnicode_KIND(temp);
13802 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013803 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013804 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013806 fillobj = zero;
13807 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 break;
13809
13810 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013811 {
13812 Py_UCS4 ch = formatchar(v);
13813 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013815 temp = _PyUnicode_FromUCS4(&ch, 1);
13816 if (temp == NULL)
13817 goto onError;
13818 pbuf = PyUnicode_DATA(temp);
13819 kind = PyUnicode_KIND(temp);
13820 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013822 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013823
13824 default:
13825 PyErr_Format(PyExc_ValueError,
13826 "unsupported format character '%c' (0x%x) "
13827 "at index %zd",
13828 (31<=c && c<=126) ? (char)c : '?',
13829 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013830 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 goto onError;
13832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 /* pbuf is initialized here. */
13834 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013835 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013836 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13837 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013838 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013839 pindex++;
13840 }
13841 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13842 signobj = plus;
13843 len--;
13844 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013845 }
13846 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013847 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013849 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 else
13851 sign = 0;
13852 }
13853 if (width < len)
13854 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013856 if (fill != ' ') {
13857 assert(signobj != NULL);
13858 if (_PyAccu_Accumulate(&acc, signobj))
13859 goto onError;
13860 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 if (width > len)
13862 width--;
13863 }
13864 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013865 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013866 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013868 second = get_latin1_char(
13869 PyUnicode_READ(kind, pbuf, pindex + 1));
13870 pindex += 2;
13871 if (second == NULL ||
13872 _PyAccu_Accumulate(&acc, zero) ||
13873 _PyAccu_Accumulate(&acc, second))
13874 goto onError;
13875 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 width -= 2;
13878 if (width < 0)
13879 width = 0;
13880 len -= 2;
13881 }
13882 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013883 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013884 if (repeat_accumulate(&acc, fillobj, width - len))
13885 goto onError;
13886 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 }
13888 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013889 if (sign) {
13890 assert(signobj != NULL);
13891 if (_PyAccu_Accumulate(&acc, signobj))
13892 goto onError;
13893 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13896 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013897 second = get_latin1_char(
13898 PyUnicode_READ(kind, pbuf, pindex + 1));
13899 pindex += 2;
13900 if (second == NULL ||
13901 _PyAccu_Accumulate(&acc, zero) ||
13902 _PyAccu_Accumulate(&acc, second))
13903 goto onError;
13904 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013905 }
13906 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013907 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013908 if (temp != NULL) {
13909 assert(pbuf == PyUnicode_DATA(temp));
13910 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013911 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013912 else {
13913 const char *p = (const char *) pbuf;
13914 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013915 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013916 v = PyUnicode_FromKindAndData(kind, p, len);
13917 }
13918 if (v == NULL)
13919 goto onError;
13920 r = _PyAccu_Accumulate(&acc, v);
13921 Py_DECREF(v);
13922 if (r)
13923 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013924 if (width > len && repeat_accumulate(&acc, blank, width - len))
13925 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013926 if (dict && (argidx < arglen) && c != '%') {
13927 PyErr_SetString(PyExc_TypeError,
13928 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 goto onError;
13930 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013931 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 } /* until end */
13934 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 PyErr_SetString(PyExc_TypeError,
13936 "not all arguments converted during string formatting");
13937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938 }
13939
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013940 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013942 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013943 }
13944 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013945 Py_XDECREF(temp);
13946 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013947 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013951 Py_XDECREF(temp);
13952 Py_XDECREF(second);
13953 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013955 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956 }
13957 return NULL;
13958}
13959
Jeremy Hylton938ace62002-07-17 16:30:39 +000013960static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013961unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13962
Tim Peters6d6c1a32001-08-02 04:15:00 +000013963static PyObject *
13964unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13965{
Benjamin Peterson29060642009-01-31 22:14:21 +000013966 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013967 static char *kwlist[] = {"object", "encoding", "errors", 0};
13968 char *encoding = NULL;
13969 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013970
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 if (type != &PyUnicode_Type)
13972 return unicode_subtype_new(type, args, kwds);
13973 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013975 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013976 if (x == NULL) {
13977 Py_INCREF(unicode_empty);
13978 return unicode_empty;
13979 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 if (encoding == NULL && errors == NULL)
13981 return PyObject_Str(x);
13982 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013983 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013984}
13985
Guido van Rossume023fe02001-08-30 03:12:59 +000013986static PyObject *
13987unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13988{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013989 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013990 Py_ssize_t length, char_size;
13991 int share_wstr, share_utf8;
13992 unsigned int kind;
13993 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013994
Benjamin Peterson14339b62009-01-31 16:36:08 +000013995 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013996
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013997 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013998 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014000 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014001 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014002 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014003 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014004 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014005
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014006 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014007 if (self == NULL) {
14008 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 return NULL;
14010 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014011 kind = PyUnicode_KIND(unicode);
14012 length = PyUnicode_GET_LENGTH(unicode);
14013
14014 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014015#ifdef Py_DEBUG
14016 _PyUnicode_HASH(self) = -1;
14017#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014018 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014019#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014020 _PyUnicode_STATE(self).interned = 0;
14021 _PyUnicode_STATE(self).kind = kind;
14022 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014023 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014024 _PyUnicode_STATE(self).ready = 1;
14025 _PyUnicode_WSTR(self) = NULL;
14026 _PyUnicode_UTF8_LENGTH(self) = 0;
14027 _PyUnicode_UTF8(self) = NULL;
14028 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014029 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014030
14031 share_utf8 = 0;
14032 share_wstr = 0;
14033 if (kind == PyUnicode_1BYTE_KIND) {
14034 char_size = 1;
14035 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14036 share_utf8 = 1;
14037 }
14038 else if (kind == PyUnicode_2BYTE_KIND) {
14039 char_size = 2;
14040 if (sizeof(wchar_t) == 2)
14041 share_wstr = 1;
14042 }
14043 else {
14044 assert(kind == PyUnicode_4BYTE_KIND);
14045 char_size = 4;
14046 if (sizeof(wchar_t) == 4)
14047 share_wstr = 1;
14048 }
14049
14050 /* Ensure we won't overflow the length. */
14051 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14052 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014053 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055 data = PyObject_MALLOC((length + 1) * char_size);
14056 if (data == NULL) {
14057 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014058 goto onError;
14059 }
14060
Victor Stinnerc3c74152011-10-02 20:39:55 +020014061 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014062 if (share_utf8) {
14063 _PyUnicode_UTF8_LENGTH(self) = length;
14064 _PyUnicode_UTF8(self) = data;
14065 }
14066 if (share_wstr) {
14067 _PyUnicode_WSTR_LENGTH(self) = length;
14068 _PyUnicode_WSTR(self) = (wchar_t *)data;
14069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014070
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014071 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014072 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014073 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014074#ifdef Py_DEBUG
14075 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14076#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014077 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014078 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014079
14080onError:
14081 Py_DECREF(unicode);
14082 Py_DECREF(self);
14083 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014084}
14085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014086PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014087 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014088\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014089Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014090encoding defaults to the current default string encoding.\n\
14091errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014092
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014093static PyObject *unicode_iter(PyObject *seq);
14094
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014096 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014097 "str", /* tp_name */
14098 sizeof(PyUnicodeObject), /* tp_size */
14099 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014101 (destructor)unicode_dealloc, /* tp_dealloc */
14102 0, /* tp_print */
14103 0, /* tp_getattr */
14104 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014105 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 unicode_repr, /* tp_repr */
14107 &unicode_as_number, /* tp_as_number */
14108 &unicode_as_sequence, /* tp_as_sequence */
14109 &unicode_as_mapping, /* tp_as_mapping */
14110 (hashfunc) unicode_hash, /* tp_hash*/
14111 0, /* tp_call*/
14112 (reprfunc) unicode_str, /* tp_str */
14113 PyObject_GenericGetAttr, /* tp_getattro */
14114 0, /* tp_setattro */
14115 0, /* tp_as_buffer */
14116 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014117 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014118 unicode_doc, /* tp_doc */
14119 0, /* tp_traverse */
14120 0, /* tp_clear */
14121 PyUnicode_RichCompare, /* tp_richcompare */
14122 0, /* tp_weaklistoffset */
14123 unicode_iter, /* tp_iter */
14124 0, /* tp_iternext */
14125 unicode_methods, /* tp_methods */
14126 0, /* tp_members */
14127 0, /* tp_getset */
14128 &PyBaseObject_Type, /* tp_base */
14129 0, /* tp_dict */
14130 0, /* tp_descr_get */
14131 0, /* tp_descr_set */
14132 0, /* tp_dictoffset */
14133 0, /* tp_init */
14134 0, /* tp_alloc */
14135 unicode_new, /* tp_new */
14136 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014137};
14138
14139/* Initialize the Unicode implementation */
14140
Victor Stinner3a50e702011-10-18 21:21:00 +020014141int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014142{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014143 int i;
14144
Thomas Wouters477c8d52006-05-27 19:21:47 +000014145 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014146 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014147 0x000A, /* LINE FEED */
14148 0x000D, /* CARRIAGE RETURN */
14149 0x001C, /* FILE SEPARATOR */
14150 0x001D, /* GROUP SEPARATOR */
14151 0x001E, /* RECORD SEPARATOR */
14152 0x0085, /* NEXT LINE */
14153 0x2028, /* LINE SEPARATOR */
14154 0x2029, /* PARAGRAPH SEPARATOR */
14155 };
14156
Fred Drakee4315f52000-05-09 19:53:39 +000014157 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014158 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014159 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014160 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014161 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014162
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014163 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014164 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014165 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014167
14168 /* initialize the linebreak bloom filter */
14169 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014170 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014171 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014172
14173 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014174
14175#ifdef HAVE_MBCS
14176 winver.dwOSVersionInfoSize = sizeof(winver);
14177 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14178 PyErr_SetFromWindowsErr(0);
14179 return -1;
14180 }
14181#endif
14182 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014183}
14184
14185/* Finalize the Unicode implementation */
14186
/* Clear the free list of str objects.  This implementation keeps no free
   list, so nothing is released and the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14192
Guido van Rossumd57fd912000-03-10 22:53:23 +000014193void
Thomas Wouters78890102000-07-22 19:25:51 +000014194_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014196 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014198 Py_XDECREF(unicode_empty);
14199 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014201 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014202 if (unicode_latin1[i]) {
14203 Py_DECREF(unicode_latin1[i]);
14204 unicode_latin1[i] = NULL;
14205 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014206 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014207 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014208 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014209}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014210
Walter Dörwald16807132007-05-25 13:52:07 +000014211void
14212PyUnicode_InternInPlace(PyObject **p)
14213{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014214 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014215 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014216#ifdef Py_DEBUG
14217 assert(s != NULL);
14218 assert(_PyUnicode_CHECK(s));
14219#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014220 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014221 return;
14222#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014223 /* If it's a subclass, we don't really know what putting
14224 it in the interned dict might do. */
14225 if (!PyUnicode_CheckExact(s))
14226 return;
14227 if (PyUnicode_CHECK_INTERNED(s))
14228 return;
14229 if (interned == NULL) {
14230 interned = PyDict_New();
14231 if (interned == NULL) {
14232 PyErr_Clear(); /* Don't leave an exception */
14233 return;
14234 }
14235 }
14236 /* It might be that the GetItem call fails even
14237 though the key is present in the dictionary,
14238 namely when this happens during a stack overflow. */
14239 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014240 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014241 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014242
Benjamin Peterson29060642009-01-31 22:14:21 +000014243 if (t) {
14244 Py_INCREF(t);
14245 Py_DECREF(*p);
14246 *p = t;
14247 return;
14248 }
Walter Dörwald16807132007-05-25 13:52:07 +000014249
Benjamin Peterson14339b62009-01-31 16:36:08 +000014250 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014251 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 PyErr_Clear();
14253 PyThreadState_GET()->recursion_critical = 0;
14254 return;
14255 }
14256 PyThreadState_GET()->recursion_critical = 0;
14257 /* The two references in interned are not counted by refcnt.
14258 The deallocator will take care of this */
14259 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014260 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014261}
14262
14263void
14264PyUnicode_InternImmortal(PyObject **p)
14265{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014266 PyUnicode_InternInPlace(p);
14267 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014268 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 Py_INCREF(*p);
14270 }
Walter Dörwald16807132007-05-25 13:52:07 +000014271}
14272
14273PyObject *
14274PyUnicode_InternFromString(const char *cp)
14275{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014276 PyObject *s = PyUnicode_FromString(cp);
14277 if (s == NULL)
14278 return NULL;
14279 PyUnicode_InternInPlace(&s);
14280 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014281}
14282
Alexander Belopolsky40018472011-02-26 01:02:56 +000014283void
14284_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014285{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014287 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014288 Py_ssize_t i, n;
14289 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014290
Benjamin Peterson14339b62009-01-31 16:36:08 +000014291 if (interned == NULL || !PyDict_Check(interned))
14292 return;
14293 keys = PyDict_Keys(interned);
14294 if (keys == NULL || !PyList_Check(keys)) {
14295 PyErr_Clear();
14296 return;
14297 }
Walter Dörwald16807132007-05-25 13:52:07 +000014298
Benjamin Peterson14339b62009-01-31 16:36:08 +000014299 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14300 detector, interned unicode strings are not forcibly deallocated;
14301 rather, we give them their stolen references back, and then clear
14302 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014303
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 n = PyList_GET_SIZE(keys);
14305 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014306 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014308 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014309 if (PyUnicode_READY(s) == -1) {
14310 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014311 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014313 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014314 case SSTATE_NOT_INTERNED:
14315 /* XXX Shouldn't happen */
14316 break;
14317 case SSTATE_INTERNED_IMMORTAL:
14318 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014319 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 break;
14321 case SSTATE_INTERNED_MORTAL:
14322 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014323 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014324 break;
14325 default:
14326 Py_FatalError("Inconsistent interned string state.");
14327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014328 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 }
14330 fprintf(stderr, "total size of all interned strings: "
14331 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14332 "mortal/immortal\n", mortal_size, immortal_size);
14333 Py_DECREF(keys);
14334 PyDict_Clear(interned);
14335 Py_DECREF(interned);
14336 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014337}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014338
14339
14340/********************* Unicode Iterator **************************/
14341
14342typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014343 PyObject_HEAD
14344 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014345 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014346} unicodeiterobject;
14347
14348static void
14349unicodeiter_dealloc(unicodeiterobject *it)
14350{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014351 _PyObject_GC_UNTRACK(it);
14352 Py_XDECREF(it->it_seq);
14353 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014354}
14355
14356static int
14357unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014359 Py_VISIT(it->it_seq);
14360 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014361}
14362
14363static PyObject *
14364unicodeiter_next(unicodeiterobject *it)
14365{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014366 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014367
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 assert(it != NULL);
14369 seq = it->it_seq;
14370 if (seq == NULL)
14371 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014372 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014374 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14375 int kind = PyUnicode_KIND(seq);
14376 void *data = PyUnicode_DATA(seq);
14377 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14378 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014379 if (item != NULL)
14380 ++it->it_index;
14381 return item;
14382 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014383
Benjamin Peterson14339b62009-01-31 16:36:08 +000014384 Py_DECREF(seq);
14385 it->it_seq = NULL;
14386 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014387}
14388
14389static PyObject *
14390unicodeiter_len(unicodeiterobject *it)
14391{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014392 Py_ssize_t len = 0;
14393 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014394 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014395 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014396}
14397
14398PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14399
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014400static PyObject *
14401unicodeiter_reduce(unicodeiterobject *it)
14402{
14403 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014404 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014405 it->it_seq, it->it_index);
14406 } else {
14407 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14408 if (u == NULL)
14409 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014410 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014411 }
14412}
14413
14414PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14415
14416static PyObject *
14417unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14418{
14419 Py_ssize_t index = PyLong_AsSsize_t(state);
14420 if (index == -1 && PyErr_Occurred())
14421 return NULL;
14422 if (index < 0)
14423 index = 0;
14424 it->it_index = index;
14425 Py_RETURN_NONE;
14426}
14427
14428PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14429
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014430static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014432 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014433 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14434 reduce_doc},
14435 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14436 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014438};
14439
14440PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014441 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14442 "str_iterator", /* tp_name */
14443 sizeof(unicodeiterobject), /* tp_basicsize */
14444 0, /* tp_itemsize */
14445 /* methods */
14446 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14447 0, /* tp_print */
14448 0, /* tp_getattr */
14449 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014450 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014451 0, /* tp_repr */
14452 0, /* tp_as_number */
14453 0, /* tp_as_sequence */
14454 0, /* tp_as_mapping */
14455 0, /* tp_hash */
14456 0, /* tp_call */
14457 0, /* tp_str */
14458 PyObject_GenericGetAttr, /* tp_getattro */
14459 0, /* tp_setattro */
14460 0, /* tp_as_buffer */
14461 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14462 0, /* tp_doc */
14463 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14464 0, /* tp_clear */
14465 0, /* tp_richcompare */
14466 0, /* tp_weaklistoffset */
14467 PyObject_SelfIter, /* tp_iter */
14468 (iternextfunc)unicodeiter_next, /* tp_iternext */
14469 unicodeiter_methods, /* tp_methods */
14470 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014471};
14472
14473static PyObject *
14474unicode_iter(PyObject *seq)
14475{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014476 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014477
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 if (!PyUnicode_Check(seq)) {
14479 PyErr_BadInternalCall();
14480 return NULL;
14481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014482 if (PyUnicode_READY(seq) == -1)
14483 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014484 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14485 if (it == NULL)
14486 return NULL;
14487 it->it_index = 0;
14488 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014489 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014490 _PyObject_GC_TRACK(it);
14491 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014492}
14493
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014494
14495size_t
14496Py_UNICODE_strlen(const Py_UNICODE *u)
14497{
14498 int res = 0;
14499 while(*u++)
14500 res++;
14501 return res;
14502}
14503
14504Py_UNICODE*
14505Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14506{
14507 Py_UNICODE *u = s1;
14508 while ((*u++ = *s2++));
14509 return s1;
14510}
14511
14512Py_UNICODE*
14513Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14514{
14515 Py_UNICODE *u = s1;
14516 while ((*u++ = *s2++))
14517 if (n-- == 0)
14518 break;
14519 return s1;
14520}
14521
14522Py_UNICODE*
14523Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14524{
14525 Py_UNICODE *u1 = s1;
14526 u1 += Py_UNICODE_strlen(u1);
14527 Py_UNICODE_strcpy(u1, s2);
14528 return s1;
14529}
14530
14531int
14532Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14533{
14534 while (*s1 && *s2 && *s1 == *s2)
14535 s1++, s2++;
14536 if (*s1 && *s2)
14537 return (*s1 < *s2) ? -1 : +1;
14538 if (*s1)
14539 return 1;
14540 if (*s2)
14541 return -1;
14542 return 0;
14543}
14544
14545int
14546Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14547{
14548 register Py_UNICODE u1, u2;
14549 for (; n != 0; n--) {
14550 u1 = *s1;
14551 u2 = *s2;
14552 if (u1 != u2)
14553 return (u1 < u2) ? -1 : +1;
14554 if (u1 == '\0')
14555 return 0;
14556 s1++;
14557 s2++;
14558 }
14559 return 0;
14560}
14561
14562Py_UNICODE*
14563Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14564{
14565 const Py_UNICODE *p;
14566 for (p = s; *p; p++)
14567 if (*p == c)
14568 return (Py_UNICODE*)p;
14569 return NULL;
14570}
14571
14572Py_UNICODE*
14573Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14574{
14575 const Py_UNICODE *p;
14576 p = s + Py_UNICODE_strlen(s);
14577 while (p != s) {
14578 p--;
14579 if (*p == c)
14580 return (Py_UNICODE*)p;
14581 }
14582 return NULL;
14583}
Victor Stinner331ea922010-08-10 16:37:20 +000014584
Victor Stinner71133ff2010-09-01 23:43:53 +000014585Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014586PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014587{
Victor Stinner577db2c2011-10-11 22:12:48 +020014588 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014589 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014591 if (!PyUnicode_Check(unicode)) {
14592 PyErr_BadArgument();
14593 return NULL;
14594 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014595 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014596 if (u == NULL)
14597 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014598 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014599 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014600 PyErr_NoMemory();
14601 return NULL;
14602 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014603 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014604 size *= sizeof(Py_UNICODE);
14605 copy = PyMem_Malloc(size);
14606 if (copy == NULL) {
14607 PyErr_NoMemory();
14608 return NULL;
14609 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014610 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014611 return copy;
14612}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014613
Georg Brandl66c221e2010-10-14 07:04:07 +000014614/* A _string module, to export formatter_parser and formatter_field_name_split
14615 to the string.Formatter class implemented in Python. */
14616
14617static PyMethodDef _string_methods[] = {
14618 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14619 METH_O, PyDoc_STR("split the argument as a field name")},
14620 {"formatter_parser", (PyCFunction) formatter_parser,
14621 METH_O, PyDoc_STR("parse the argument as a format string")},
14622 {NULL, NULL}
14623};
14624
14625static struct PyModuleDef _string_module = {
14626 PyModuleDef_HEAD_INIT,
14627 "_string",
14628 PyDoc_STR("string helper module"),
14629 0,
14630 _string_methods,
14631 NULL,
14632 NULL,
14633 NULL,
14634 NULL
14635};
14636
14637PyMODINIT_FUNC
14638PyInit__string(void)
14639{
14640 return PyModule_Create(&_string_module);
14641}
14642
14643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014644#ifdef __cplusplus
14645}
14646#endif