blob: afe7a9fa68d2c4298987407ca450c9f865f52a4d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches: exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN gets defined; little endian is the default
   when WORDS_BIGENDIAN is not set. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Highest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF
72
/* _PyUnicode_CHECK(op): a plain type check in regular builds; in Py_DEBUG
   builds it runs _PyUnicode_CheckConsistency() without the content check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* --- Private field accessors ---------------------------------------------
   The underscore-prefixed accessors below are raw, unchecked field accesses
   through a cast (and are usable as lvalues), except where they contain an
   explicit assert(). */

/* utf8 field, read through a PyCompactUnicodeObject cast. */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory directly after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((char *)((PyASCIIObject *)(op) + 1)) :                \
         _PyUnicode_UTF8(op))

/* utf8_length field, read through a PyCompactUnicodeObject cast. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string: for compact ASCII objects
   this is the string length itself, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((PyASCIIObject *)(op))->length :                      \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, guarded by a _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer, read through a PyUnicodeObject cast. */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer aliases the character data.  Must not be used
   on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer aliases the character data.  The macro argument
   is used throughout, so it no longer depends on the caller having a
   variable that is literally named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* True when the object has its own UTF-8 memory block: it is not a compact
   ASCII object, the utf8 pointer is set, and that pointer does not alias
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op)                                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True when the object has its own wstr memory block: the wstr pointer is
   set and either the object is not ready yet or the pointer does not alias
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) &&                                    \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  "to" is parenthesized before
   being cast, so an expression such as "buf + offset" is converted as a
   whole instead of having the cast bind to "buf" only.  The main loop
   copies four characters per iteration; the trailing loop handles the
   remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~(Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: one entry per
   ASCII code point (0x00-0x7F), non-zero when the code point is flagged. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: one entry per ASCII code point (0x00-0x7F), non-zero
   when the code point is flagged as a line break.  The table is only ever
   read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   Asserts that the state bits (kind, compact, ascii, ready) agree with each
   other and with the utf8/wstr/data pointers and their cached lengths.
   When check_content is non-zero and the string is not in the legacy wchar
   form, the characters are also scanned to verify that the narrowest
   possible kind is in use.

   Always returns 1 so that it can be wrapped in assert(); violations are
   reported through the assert() calls inside. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            chars = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cobj->utf8 != chars);
        }
        else {
            /* non-compact: characters live behind data.any */
            chars = ((PyUnicodeObject *)op)->data.any;

            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready yet */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(chars == NULL);
                assert(cobj->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(chars != NULL);
                if (base->state.ascii) {
                    assert(cobj->utf8 == chars);
                    assert(cobj->utf8_length == base->length);
                }
                else {
                    assert(cobj->utf8 != chars);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the widths match */
            if (kind == wchar_kind) {
                assert(base->wstr == chars);
                assert(cobj->wstr_length == base->length);
            }
            else {
                assert(base->wstr != chars);
            }
        }

        /* a missing buffer must have a zero cached length */
        assert(cobj->utf8 != NULL || cobj->utf8_length == 0);
        assert(base->wstr != NULL || cobj->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 cp = PyUnicode_READ(kind, chars, pos);
            if (highest < cp)
                highest = cp;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (base->state.ascii != 0) {
                assert(highest < 128);
            }
            else {
                assert(highest >= 128);
                assert(highest <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
#if defined(HAVE_MBCS)
/* Only present in builds with HAVE_MBCS.
   NOTE(review): presumably caches the Windows version for the MBCS codec;
   confirm where it is filled in. */
static OSVERSIONINFOEX winver;
#endif
507
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */

515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the mask bit selected by the low bits
   of ch.  The mask argument is parenthesized so that an expression such as
   "a | b" is combined as a whole instead of being split by the higher
   precedence of "&". */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is
   used as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* True if chr occurs in str: the bloom mask is checked first and
   PyUnicode_FindChar() is only called when the mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate room for one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
Victor Stinner15e9ed22012-02-22 13:36:20 +01001001 assert(maxchar <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 kind_state = PyUnicode_4BYTE_KIND;
1003 char_size = 4;
1004 if (sizeof(wchar_t) == 4)
1005 is_sharing = 1;
1006 }
1007
1008 /* Ensure we won't overflow the size. */
1009 if (size < 0) {
1010 PyErr_SetString(PyExc_SystemError,
1011 "Negative size passed to PyUnicode_New");
1012 return NULL;
1013 }
1014 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1015 return PyErr_NoMemory();
1016
1017 /* Duplicated allocation code from _PyObject_New() instead of a call to
1018 * PyObject_New() so we are able to allocate space for the object and
1019 * it's data buffer.
1020 */
1021 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1022 if (obj == NULL)
1023 return PyErr_NoMemory();
1024 obj = PyObject_INIT(obj, &PyUnicode_Type);
1025 if (obj == NULL)
1026 return NULL;
1027
1028 unicode = (PyCompactUnicodeObject *)obj;
1029 if (is_ascii)
1030 data = ((PyASCIIObject*)obj) + 1;
1031 else
1032 data = unicode + 1;
1033 _PyUnicode_LENGTH(unicode) = size;
1034 _PyUnicode_HASH(unicode) = -1;
1035 _PyUnicode_STATE(unicode).interned = 0;
1036 _PyUnicode_STATE(unicode).kind = kind_state;
1037 _PyUnicode_STATE(unicode).compact = 1;
1038 _PyUnicode_STATE(unicode).ready = 1;
1039 _PyUnicode_STATE(unicode).ascii = is_ascii;
1040 if (is_ascii) {
1041 ((char*)data)[size] = 0;
1042 _PyUnicode_WSTR(unicode) = NULL;
1043 }
1044 else if (kind_state == PyUnicode_1BYTE_KIND) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 }
1051 else {
1052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 if (kind_state == PyUnicode_2BYTE_KIND)
1055 ((Py_UCS2*)data)[size] = 0;
1056 else /* kind_state == PyUnicode_4BYTE_KIND */
1057 ((Py_UCS4*)data)[size] = 0;
1058 if (is_sharing) {
1059 _PyUnicode_WSTR_LENGTH(unicode) = size;
1060 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1061 }
1062 else {
1063 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1064 _PyUnicode_WSTR(unicode) = NULL;
1065 }
1066 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001067 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 return obj;
1069}
1070
1071#if SIZEOF_WCHAR_T == 2
1072/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1073 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001074 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075
1076 This function assumes that unicode can hold one more code point than wstr
1077 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001078static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001080 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081{
1082 const wchar_t *iter;
1083 Py_UCS4 *ucs4_out;
1084
Victor Stinner910337b2011-10-03 03:20:16 +02001085 assert(unicode != NULL);
1086 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1088 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1089
1090 for (iter = begin; iter < end; ) {
1091 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1092 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001093 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1094 && (iter+1) < end
1095 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096 {
Victor Stinner551ac952011-11-29 22:58:13 +01001097 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 iter += 2;
1099 }
1100 else {
1101 *ucs4_out++ = *iter;
1102 iter++;
1103 }
1104 }
1105 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1106 _PyUnicode_GET_LENGTH(unicode)));
1107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108}
1109#endif
1110
Victor Stinnercd9950f2011-10-02 00:34:53 +02001111static int
Victor Stinner488fa492011-12-12 00:01:39 +01001112unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001113{
Victor Stinner488fa492011-12-12 00:01:39 +01001114 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001115 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001116 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117 return -1;
1118 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119 return 0;
1120}
1121
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001122static int
1123_copy_characters(PyObject *to, Py_ssize_t to_start,
1124 PyObject *from, Py_ssize_t from_start,
1125 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 unsigned int from_kind, to_kind;
1128 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001129 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001131 assert(PyUnicode_Check(from));
1132 assert(PyUnicode_Check(to));
1133 assert(PyUnicode_IS_READY(from));
1134 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1137 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1138 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001140 if (how_many == 0)
1141 return 0;
1142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001144 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001146 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001148#ifdef Py_DEBUG
1149 if (!check_maxchar
1150 && (from_kind > to_kind
1151 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1154 Py_UCS4 ch;
1155 Py_ssize_t i;
1156 for (i=0; i < how_many; i++) {
1157 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1158 assert(ch <= to_maxchar);
1159 }
1160 }
1161#endif
1162 fast = (from_kind == to_kind);
1163 if (check_maxchar
1164 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1165 {
1166 /* deny latin1 => ascii */
1167 fast = 0;
1168 }
1169
1170 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001171 Py_MEMCPY((char*)to_data + to_kind * to_start,
1172 (char*)from_data + from_kind * from_start,
1173 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 else if (from_kind == PyUnicode_1BYTE_KIND
1176 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001177 {
1178 _PyUnicode_CONVERT_BYTES(
1179 Py_UCS1, Py_UCS2,
1180 PyUnicode_1BYTE_DATA(from) + from_start,
1181 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1182 PyUnicode_2BYTE_DATA(to) + to_start
1183 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001184 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001185 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001186 && to_kind == PyUnicode_4BYTE_KIND)
1187 {
1188 _PyUnicode_CONVERT_BYTES(
1189 Py_UCS1, Py_UCS4,
1190 PyUnicode_1BYTE_DATA(from) + from_start,
1191 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1192 PyUnicode_4BYTE_DATA(to) + to_start
1193 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001194 }
1195 else if (from_kind == PyUnicode_2BYTE_KIND
1196 && to_kind == PyUnicode_4BYTE_KIND)
1197 {
1198 _PyUnicode_CONVERT_BYTES(
1199 Py_UCS2, Py_UCS4,
1200 PyUnicode_2BYTE_DATA(from) + from_start,
1201 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1202 PyUnicode_4BYTE_DATA(to) + to_start
1203 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 /* check if max_char(from substring) <= max_char(to) */
1207 if (from_kind > to_kind
1208 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001209 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 /* slow path to check for character overflow */
1212 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 Py_ssize_t i;
1215
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 for (i=0; i < how_many; i++) {
1218 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001219 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001222#else
1223 if (!check_maxchar) {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1227 }
1228 }
1229 else {
1230 for (i=0; i < how_many; i++) {
1231 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1232 if (ch > to_maxchar)
1233 return 1;
1234 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1235 }
1236 }
1237#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001238 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001239 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001240 assert(0 && "inconsistent state");
1241 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
1243 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001244 return 0;
1245}
1246
1247static void
1248copy_characters(PyObject *to, Py_ssize_t to_start,
1249 PyObject *from, Py_ssize_t from_start,
1250 Py_ssize_t how_many)
1251{
1252 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1253}
1254
1255Py_ssize_t
1256PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1257 PyObject *from, Py_ssize_t from_start,
1258 Py_ssize_t how_many)
1259{
1260 int err;
1261
1262 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1263 PyErr_BadInternalCall();
1264 return -1;
1265 }
1266
Benjamin Petersonbac79492012-01-14 13:34:47 -05001267 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001268 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001269 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270 return -1;
1271
1272 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1273 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1274 PyErr_Format(PyExc_SystemError,
1275 "Cannot write %zi characters at %zi "
1276 "in a string of %zi characters",
1277 how_many, to_start, PyUnicode_GET_LENGTH(to));
1278 return -1;
1279 }
1280
1281 if (how_many == 0)
1282 return 0;
1283
Victor Stinner488fa492011-12-12 00:01:39 +01001284 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001285 return -1;
1286
1287 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1288 if (err) {
1289 PyErr_Format(PyExc_SystemError,
1290 "Cannot copy %s characters "
1291 "into a string of %s characters",
1292 unicode_kind_name(from),
1293 unicode_kind_name(to));
1294 return -1;
1295 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001296 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297}
1298
Victor Stinner17222162011-09-28 22:15:37 +02001299/* Find the maximum code point and count the number of surrogate pairs so a
1300 correct string length can be computed before converting a string to UCS4.
1301 This function counts single surrogates as a character and not as a pair.
1302
1303 Return 0 on success, or -1 on error. */
1304static int
1305find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1306 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307{
1308 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001309 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310
Victor Stinnerc53be962011-10-02 21:33:54 +02001311 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 *num_surrogates = 0;
1313 *maxchar = 0;
1314
1315 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001317 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1318 && (iter+1) < end
1319 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 iter += 2;
1324 }
1325 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001327 {
1328 ch = *iter;
1329 iter++;
1330 }
1331 if (ch > *maxchar) {
1332 *maxchar = ch;
1333 if (*maxchar > MAX_UNICODE) {
1334 PyErr_Format(PyExc_ValueError,
1335 "character U+%x is not in range [U+0000; U+10ffff]",
1336 ch);
1337 return -1;
1338 }
1339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 }
1341 return 0;
1342}
1343
1344#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001345static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
1347
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001348int
1349_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350{
1351 wchar_t *end;
1352 Py_UCS4 maxchar = 0;
1353 Py_ssize_t num_surrogates;
1354#if SIZEOF_WCHAR_T == 2
1355 Py_ssize_t length_wo_surrogates;
1356#endif
1357
Georg Brandl7597add2011-10-05 16:36:47 +02001358 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001359 strings were created using _PyObject_New() and where no canonical
1360 representation (the str field) has been set yet aka strings
1361 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001362 assert(_PyUnicode_CHECK(unicode));
1363 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001365 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001366 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001367 /* Actually, it should neither be interned nor be anything else: */
1368 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369
1370#ifdef Py_DEBUG
1371 ++unicode_ready_calls;
1372#endif
1373
1374 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001375 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001376 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378
1379 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1381 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 PyErr_NoMemory();
1383 return -1;
1384 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001385 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 _PyUnicode_WSTR(unicode), end,
1387 PyUnicode_1BYTE_DATA(unicode));
1388 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1389 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1390 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1391 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001392 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001393 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001394 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001397 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8(unicode) = NULL;
1399 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 }
1401 PyObject_FREE(_PyUnicode_WSTR(unicode));
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1404 }
1405 /* In this case we might have to convert down from 4-byte native
1406 wchar_t to 2-byte unicode. */
1407 else if (maxchar < 65536) {
1408 assert(num_surrogates == 0 &&
1409 "FindMaxCharAndNumSurrogatePairs() messed up");
1410
Victor Stinner506f5922011-09-28 22:34:18 +02001411#if SIZEOF_WCHAR_T == 2
1412 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001413 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001414 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1415 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1416 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001419#else
1420 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001422 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001423 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001424 PyErr_NoMemory();
1425 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 }
Victor Stinner506f5922011-09-28 22:34:18 +02001427 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1428 _PyUnicode_WSTR(unicode), end,
1429 PyUnicode_2BYTE_DATA(unicode));
1430 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1431 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1432 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8(unicode) = NULL;
1434 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001435 PyObject_FREE(_PyUnicode_WSTR(unicode));
1436 _PyUnicode_WSTR(unicode) = NULL;
1437 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1438#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1441 else {
1442#if SIZEOF_WCHAR_T == 2
1443 /* in case the native representation is 2-bytes, we need to allocate a
1444 new normalized 4-byte version. */
1445 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001446 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1447 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 PyErr_NoMemory();
1449 return -1;
1450 }
1451 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001455 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1456 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001457 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 PyObject_FREE(_PyUnicode_WSTR(unicode));
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1461#else
1462 assert(num_surrogates == 0);
1463
Victor Stinnerc3c74152011-10-02 20:39:55 +02001464 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001466 _PyUnicode_UTF8(unicode) = NULL;
1467 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1469#endif
1470 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1471 }
1472 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001473 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return 0;
1475}
1476
Alexander Belopolsky40018472011-02-26 01:02:56 +00001477static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001478unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479{
Walter Dörwald16807132007-05-25 13:52:07 +00001480 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001481 case SSTATE_NOT_INTERNED:
1482 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 case SSTATE_INTERNED_MORTAL:
1485 /* revive dead object temporarily for DelItem */
1486 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001487 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 Py_FatalError(
1489 "deletion of interned string failed");
1490 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001491
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 case SSTATE_INTERNED_IMMORTAL:
1493 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001494
Benjamin Peterson29060642009-01-31 22:14:21 +00001495 default:
1496 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001497 }
1498
Victor Stinner03490912011-10-03 23:45:12 +02001499 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001501 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001503 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1504 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001506 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared objects visible
   here (the empty string, or an entry of the unicode_latin1 table), and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       Latin-1 table entry. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1525
Alexander Belopolsky40018472011-02-26 01:02:56 +00001526static int
Victor Stinner488fa492011-12-12 00:01:39 +01001527unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001528{
Victor Stinner488fa492011-12-12 00:01:39 +01001529 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530 if (Py_REFCNT(unicode) != 1)
1531 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001532 if (_PyUnicode_HASH(unicode) != -1)
1533 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001536 if (!PyUnicode_CheckExact(unicode))
1537 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001538#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 /* singleton refcount is greater than 1 */
1540 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001541#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 1;
1543}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001544
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545static int
1546unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1547{
1548 PyObject *unicode;
1549 Py_ssize_t old_length;
1550
1551 assert(p_unicode != NULL);
1552 unicode = *p_unicode;
1553
1554 assert(unicode != NULL);
1555 assert(PyUnicode_Check(unicode));
1556 assert(0 <= length);
1557
Victor Stinner910337b2011-10-03 03:20:16 +02001558 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001559 old_length = PyUnicode_WSTR_LENGTH(unicode);
1560 else
1561 old_length = PyUnicode_GET_LENGTH(unicode);
1562 if (old_length == length)
1563 return 0;
1564
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001565 if (length == 0) {
1566 Py_DECREF(*p_unicode);
1567 *p_unicode = unicode_empty;
1568 Py_INCREF(*p_unicode);
1569 return 0;
1570 }
1571
Victor Stinner488fa492011-12-12 00:01:39 +01001572 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 PyObject *copy = resize_copy(unicode, length);
1574 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 Py_DECREF(*p_unicode);
1577 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001579 }
1580
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001582 PyObject *new_unicode = resize_compact(unicode, length);
1583 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001585 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001586 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001588 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001589 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001590}
1591
Alexander Belopolsky40018472011-02-26 01:02:56 +00001592int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001593PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001594{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 PyObject *unicode;
1596 if (p_unicode == NULL) {
1597 PyErr_BadInternalCall();
1598 return -1;
1599 }
1600 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001601 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 {
1603 PyErr_BadInternalCall();
1604 return -1;
1605 }
1606 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001607}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001608
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001610unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611{
1612 PyObject *result;
1613 assert(PyUnicode_IS_READY(*p_unicode));
1614 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1615 return 0;
1616 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1617 maxchar);
1618 if (result == NULL)
1619 return -1;
1620 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1621 PyUnicode_GET_LENGTH(*p_unicode));
1622 Py_DECREF(*p_unicode);
1623 *p_unicode = result;
1624 return 0;
1625}
1626
1627static int
1628unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1629 Py_UCS4 ch)
1630{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001631 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001632 if (unicode_widen(p_unicode, ch) < 0)
1633 return -1;
1634 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1635 PyUnicode_DATA(*p_unicode),
1636 (*pos)++, ch);
1637 return 0;
1638}
1639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640static PyObject*
1641get_latin1_char(unsigned char ch)
1642{
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 if (!unicode)
1647 return NULL;
1648 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001649 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 unicode_latin1[ch] = unicode;
1651 }
1652 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001653 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654}
1655
Alexander Belopolsky40018472011-02-26 01:02:56 +00001656PyObject *
1657PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001659 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 Py_UCS4 maxchar = 0;
1661 Py_ssize_t num_surrogates;
1662
1663 if (u == NULL)
1664 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666 /* If the Unicode data is known at construction time, we can apply
1667 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 /* Optimization for empty strings */
1670 if (size == 0 && unicode_empty != NULL) {
1671 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001672 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 }
Tim Petersced69f82003-09-16 20:30:58 +00001674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 /* Single character Unicode objects in the Latin-1 range are
1676 shared when using this constructor */
1677 if (size == 1 && *u < 256)
1678 return get_latin1_char((unsigned char)*u);
1679
1680 /* If not empty and not single character, copy the Unicode data
1681 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001682 if (find_maxchar_surrogates(u, u + size,
1683 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 return NULL;
1685
Victor Stinner8faf8212011-12-08 22:14:11 +01001686 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 if (!unicode)
1688 return NULL;
1689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 switch (PyUnicode_KIND(unicode)) {
1691 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001692 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1694 break;
1695 case PyUnicode_2BYTE_KIND:
1696#if Py_UNICODE_SIZE == 2
1697 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1698#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001699 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1701#endif
1702 break;
1703 case PyUnicode_4BYTE_KIND:
1704#if SIZEOF_WCHAR_T == 2
1705 /* This is the only case which has to process surrogates, thus
1706 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001707 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708#else
1709 assert(num_surrogates == 0);
1710 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1711#endif
1712 break;
1713 default:
1714 assert(0 && "Impossible state");
1715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001717 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001722{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 if (size < 0) {
1724 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001725 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001726 return NULL;
1727 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001728 if (u != NULL)
1729 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1730 else
1731 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001732}
1733
Alexander Belopolsky40018472011-02-26 01:02:56 +00001734PyObject *
1735PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001736{
1737 size_t size = strlen(u);
1738 if (size > PY_SSIZE_T_MAX) {
1739 PyErr_SetString(PyExc_OverflowError, "input too long");
1740 return NULL;
1741 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001742 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001743}
1744
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001745PyObject *
1746_PyUnicode_FromId(_Py_Identifier *id)
1747{
1748 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001749 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1750 strlen(id->string),
1751 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001752 if (!id->object)
1753 return NULL;
1754 PyUnicode_InternInPlace(&id->object);
1755 assert(!id->next);
1756 id->next = static_strings;
1757 static_strings = id;
1758 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001759 return id->object;
1760}
1761
1762void
1763_PyUnicode_ClearStaticStrings()
1764{
1765 _Py_Identifier *i;
1766 for (i = static_strings; i; i = i->next) {
1767 Py_DECREF(i->object);
1768 i->object = NULL;
1769 i->next = NULL;
1770 }
1771}
1772
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001773/* Internal function, don't check maximum character */
1774
Victor Stinnere57b1c02011-09-28 22:20:48 +02001775static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001776unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001777{
Victor Stinner785938e2011-12-11 20:09:03 +01001778 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001781 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001782#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001783 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001784 }
Victor Stinner785938e2011-12-11 20:09:03 +01001785 unicode = PyUnicode_New(size, 127);
1786 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001787 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001788 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1789 assert(_PyUnicode_CheckConsistency(unicode, 1));
1790 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001791}
1792
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001793static Py_UCS4
1794kind_maxchar_limit(unsigned int kind)
1795{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001796 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001797 case PyUnicode_1BYTE_KIND:
1798 return 0x80;
1799 case PyUnicode_2BYTE_KIND:
1800 return 0x100;
1801 case PyUnicode_4BYTE_KIND:
1802 return 0x10000;
1803 default:
1804 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001805 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001806 }
1807}
1808
Victor Stinner702c7342011-10-05 13:50:52 +02001809static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001810_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001814
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001815 if (size == 0) {
1816 Py_INCREF(unicode_empty);
1817 return unicode_empty;
1818 }
1819 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001820 if (size == 1)
1821 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001822
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001823 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001824 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 if (!res)
1826 return NULL;
1827 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001828 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001830}
1831
Victor Stinnere57b1c02011-09-28 22:20:48 +02001832static PyObject*
1833_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834{
1835 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001837
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838 if (size == 0) {
1839 Py_INCREF(unicode_empty);
1840 return unicode_empty;
1841 }
1842 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001843 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001844 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001845
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001846 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001847 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 if (!res)
1849 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001850 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001852 else {
1853 _PyUnicode_CONVERT_BYTES(
1854 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1855 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001856 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 return res;
1858}
1859
Victor Stinnere57b1c02011-09-28 22:20:48 +02001860static PyObject*
1861_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862{
1863 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001865
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001866 if (size == 0) {
1867 Py_INCREF(unicode_empty);
1868 return unicode_empty;
1869 }
1870 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001871 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872 return get_latin1_char((unsigned char)u[0]);
1873
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001874 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001875 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 if (!res)
1877 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001878 if (max_char < 256)
1879 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1880 PyUnicode_1BYTE_DATA(res));
1881 else if (max_char < 0x10000)
1882 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1883 PyUnicode_2BYTE_DATA(res));
1884 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001886 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 return res;
1888}
1889
1890PyObject*
1891PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1892{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001893 if (size < 0) {
1894 PyErr_SetString(PyExc_ValueError, "size must be positive");
1895 return NULL;
1896 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001897 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001903 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001904 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001905 PyErr_SetString(PyExc_SystemError, "invalid kind");
1906 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908}
1909
Victor Stinner25a4b292011-10-06 12:31:55 +02001910/* Ensure that a string uses the most efficient storage, if it is not the
1911 case: create a new string with of the right kind. Write NULL into *p_unicode
1912 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001913static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001914unicode_adjust_maxchar(PyObject **p_unicode)
1915{
1916 PyObject *unicode, *copy;
1917 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001918 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001919 unsigned int kind;
1920
1921 assert(p_unicode != NULL);
1922 unicode = *p_unicode;
1923 assert(PyUnicode_IS_READY(unicode));
1924 if (PyUnicode_IS_ASCII(unicode))
1925 return;
1926
1927 len = PyUnicode_GET_LENGTH(unicode);
1928 kind = PyUnicode_KIND(unicode);
1929 if (kind == PyUnicode_1BYTE_KIND) {
1930 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001931 max_char = ucs1lib_find_max_char(u, u + len);
1932 if (max_char >= 128)
1933 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001934 }
1935 else if (kind == PyUnicode_2BYTE_KIND) {
1936 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs2lib_find_max_char(u, u + len);
1938 if (max_char >= 256)
1939 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001940 }
1941 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001943 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs4lib_find_max_char(u, u + len);
1945 if (max_char >= 0x10000)
1946 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001947 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001948 copy = PyUnicode_New(len, max_char);
1949 copy_characters(copy, 0, unicode, 0, len);
1950 Py_DECREF(unicode);
1951 *p_unicode = copy;
1952}
1953
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001955_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001956{
Victor Stinner87af4f22011-11-21 23:03:47 +01001957 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001958 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001959
Victor Stinner034f6cf2011-09-30 02:26:44 +02001960 if (!PyUnicode_Check(unicode)) {
1961 PyErr_BadInternalCall();
1962 return NULL;
1963 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001964 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001965 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001966
Victor Stinner87af4f22011-11-21 23:03:47 +01001967 length = PyUnicode_GET_LENGTH(unicode);
1968 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001969 if (!copy)
1970 return NULL;
1971 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1972
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1974 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001975 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001976 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001977}
1978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979
Victor Stinnerbc603d12011-10-02 01:00:40 +02001980/* Widen Unicode objects to larger buffers. Don't write terminating null
1981 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982
1983void*
1984_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1985{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001986 Py_ssize_t len;
1987 void *result;
1988 unsigned int skind;
1989
Benjamin Petersonbac79492012-01-14 13:34:47 -05001990 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001991 return NULL;
1992
1993 len = PyUnicode_GET_LENGTH(s);
1994 skind = PyUnicode_KIND(s);
1995 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001996 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 return NULL;
1998 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001999 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002000 case PyUnicode_2BYTE_KIND:
2001 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2002 if (!result)
2003 return PyErr_NoMemory();
2004 assert(skind == PyUnicode_1BYTE_KIND);
2005 _PyUnicode_CONVERT_BYTES(
2006 Py_UCS1, Py_UCS2,
2007 PyUnicode_1BYTE_DATA(s),
2008 PyUnicode_1BYTE_DATA(s) + len,
2009 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002011 case PyUnicode_4BYTE_KIND:
2012 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2013 if (!result)
2014 return PyErr_NoMemory();
2015 if (skind == PyUnicode_2BYTE_KIND) {
2016 _PyUnicode_CONVERT_BYTES(
2017 Py_UCS2, Py_UCS4,
2018 PyUnicode_2BYTE_DATA(s),
2019 PyUnicode_2BYTE_DATA(s) + len,
2020 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002022 else {
2023 assert(skind == PyUnicode_1BYTE_KIND);
2024 _PyUnicode_CONVERT_BYTES(
2025 Py_UCS1, Py_UCS4,
2026 PyUnicode_1BYTE_DATA(s),
2027 PyUnicode_1BYTE_DATA(s) + len,
2028 result);
2029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002031 default:
2032 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 }
Victor Stinner01698042011-10-04 00:04:26 +02002034 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return NULL;
2036}
2037
2038static Py_UCS4*
2039as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2040 int copy_null)
2041{
2042 int kind;
2043 void *data;
2044 Py_ssize_t len, targetlen;
2045 if (PyUnicode_READY(string) == -1)
2046 return NULL;
2047 kind = PyUnicode_KIND(string);
2048 data = PyUnicode_DATA(string);
2049 len = PyUnicode_GET_LENGTH(string);
2050 targetlen = len;
2051 if (copy_null)
2052 targetlen++;
2053 if (!target) {
2054 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2055 PyErr_NoMemory();
2056 return NULL;
2057 }
2058 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2059 if (!target) {
2060 PyErr_NoMemory();
2061 return NULL;
2062 }
2063 }
2064 else {
2065 if (targetsize < targetlen) {
2066 PyErr_Format(PyExc_SystemError,
2067 "string is longer than the buffer");
2068 if (copy_null && 0 < targetsize)
2069 target[0] = 0;
2070 return NULL;
2071 }
2072 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002073 if (kind == PyUnicode_1BYTE_KIND) {
2074 Py_UCS1 *start = (Py_UCS1 *) data;
2075 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002077 else if (kind == PyUnicode_2BYTE_KIND) {
2078 Py_UCS2 *start = (Py_UCS2 *) data;
2079 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2080 }
2081 else {
2082 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 if (copy_null)
2086 target[len] = 0;
2087 return target;
2088}
2089
2090Py_UCS4*
2091PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2092 int copy_null)
2093{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002094 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 PyErr_BadInternalCall();
2096 return NULL;
2097 }
2098 return as_ucs4(string, target, targetsize, copy_null);
2099}
2100
2101Py_UCS4*
2102PyUnicode_AsUCS4Copy(PyObject *string)
2103{
2104 return as_ucs4(string, NULL, 0, 1);
2105}
2106
#ifdef HAVE_WCHAR_H

/* Create a unicode object from `size` wide characters at `w`, by way of
   PyUnicode_FromUnicode().

   A size of -1 means that the length is taken from wcslen(w).  When `w` is
   NULL, a size of 0 yields the shared empty string and any other size is
   reported with PyErr_BadInternalCall() (NULL is returned). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002129
Walter Dörwald346737f2007-05-31 10:44:43 +00002130static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002131makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2132 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002134 *fmt++ = '%';
2135 if (width) {
2136 if (zeropad)
2137 *fmt++ = '0';
2138 fmt += sprintf(fmt, "%d", width);
2139 }
2140 if (precision)
2141 fmt += sprintf(fmt, ".%d", precision);
2142 if (longflag)
2143 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002144 else if (longlongflag) {
2145 /* longlongflag should only ever be nonzero on machines with
2146 HAVE_LONG_LONG defined */
2147#ifdef HAVE_LONG_LONG
2148 char *f = PY_FORMAT_LONG_LONG;
2149 while (*f)
2150 *fmt++ = *f++;
2151#else
2152 /* we shouldn't ever get here */
2153 assert(0);
2154 *fmt++ = 'l';
2155#endif
2156 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002157 else if (size_tflag) {
2158 char *f = PY_FORMAT_SIZE_T;
2159 while (*f)
2160 *fmt++ = *f++;
2161 }
2162 *fmt++ = c;
2163 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002164}
2165
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between '%' and the conversion character.

   f must point at the '%'.  The function reads an optional decimal width,
   an optional ".precision" and an optional length modifier:
       "l"   followed by d, u or i          -> *p_longflag = 1
       "ll"  followed by d, u or i          -> *p_longlongflag = 1
                                               (only with HAVE_LONG_LONG)
       "z"   followed by d, u or i          -> *p_size_tflag = 1
   A modifier that is not followed by d, u or i is not consumed.

   Every output pointer may be NULL, in which case that value is simply
   not reported.  Missing width/precision are reported as 0.

   Returns a pointer to the conversion character.  For the malformed
   inputs "%<digits>.<digits>%" and a specification cut off by the end of
   the string, the pointer is moved back by one character so that the
   caller's loop does not run past the '%' or the terminating NUL. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' and read "width[.precision]",
       e.g. "%2.5s" => width=2, precision=5 */
    for (f++; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* report only what the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2230
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002231/* maximum number of characters required for output of %ld. 21 characters
2232 allows for 64-bit integers (in decimal) and an optional sign. */
2233#define MAX_LONG_CHARS 21
2234/* maximum number of characters required for output of %lld.
2235 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2236 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2237#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2238
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239PyObject *
2240PyUnicode_FromFormatV(const char *format, va_list vargs)
2241{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002242 va_list count;
2243 Py_ssize_t callcount = 0;
2244 PyObject **callresults = NULL;
2245 PyObject **callresult = NULL;
2246 Py_ssize_t n = 0;
2247 int width = 0;
2248 int precision = 0;
2249 int zeropad;
2250 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002251 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002253 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2255 Py_UCS4 argmaxchar;
2256 Py_ssize_t numbersize = 0;
2257 char *numberresults = NULL;
2258 char *numberresult = NULL;
2259 Py_ssize_t i;
2260 int kind;
2261 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002262
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002263 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002264 /* step 1: count the number of %S/%R/%A/%s format specifications
2265 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2266 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002268 * also estimate a upper bound for all the number formats in the string,
2269 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 for (f = format; *f; f++) {
2272 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002273 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2275 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2276 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2277 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002280#ifdef HAVE_LONG_LONG
2281 if (longlongflag) {
2282 if (width < MAX_LONG_LONG_CHARS)
2283 width = MAX_LONG_LONG_CHARS;
2284 }
2285 else
2286#endif
2287 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2288 including sign. Decimal takes the most space. This
2289 isn't enough for octal. If a width is specified we
2290 need more (which we allocate later). */
2291 if (width < MAX_LONG_CHARS)
2292 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293
2294 /* account for the size + '\0' to separate numbers
2295 inside of the numberresults buffer */
2296 numbersize += (width + 1);
2297 }
2298 }
2299 else if ((unsigned char)*f > 127) {
2300 PyErr_Format(PyExc_ValueError,
2301 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2302 "string, got a non-ASCII byte: 0x%02x",
2303 (unsigned char)*f);
2304 return NULL;
2305 }
2306 }
2307 /* step 2: allocate memory for the results of
2308 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2309 if (callcount) {
2310 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2311 if (!callresults) {
2312 PyErr_NoMemory();
2313 return NULL;
2314 }
2315 callresult = callresults;
2316 }
2317 /* step 2.5: allocate memory for the results of formating numbers */
2318 if (numbersize) {
2319 numberresults = PyObject_Malloc(numbersize);
2320 if (!numberresults) {
2321 PyErr_NoMemory();
2322 goto fail;
2323 }
2324 numberresult = numberresults;
2325 }
2326
2327 /* step 3: format numbers and figure out how large a buffer we need */
2328 for (f = format; *f; f++) {
2329 if (*f == '%') {
2330 const char* p;
2331 int longflag;
2332 int longlongflag;
2333 int size_tflag;
2334 int numprinted;
2335
2336 p = f;
2337 zeropad = (f[1] == '0');
2338 f = parse_format_flags(f, &width, &precision,
2339 &longflag, &longlongflag, &size_tflag);
2340 switch (*f) {
2341 case 'c':
2342 {
2343 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002344 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 n++;
2346 break;
2347 }
2348 case '%':
2349 n++;
2350 break;
2351 case 'i':
2352 case 'd':
2353 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2354 width, precision, *f);
2355 if (longflag)
2356 numprinted = sprintf(numberresult, fmt,
2357 va_arg(count, long));
2358#ifdef HAVE_LONG_LONG
2359 else if (longlongflag)
2360 numprinted = sprintf(numberresult, fmt,
2361 va_arg(count, PY_LONG_LONG));
2362#endif
2363 else if (size_tflag)
2364 numprinted = sprintf(numberresult, fmt,
2365 va_arg(count, Py_ssize_t));
2366 else
2367 numprinted = sprintf(numberresult, fmt,
2368 va_arg(count, int));
2369 n += numprinted;
2370 /* advance by +1 to skip over the '\0' */
2371 numberresult += (numprinted + 1);
2372 assert(*(numberresult - 1) == '\0');
2373 assert(*(numberresult - 2) != '\0');
2374 assert(numprinted >= 0);
2375 assert(numberresult <= numberresults + numbersize);
2376 break;
2377 case 'u':
2378 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2379 width, precision, 'u');
2380 if (longflag)
2381 numprinted = sprintf(numberresult, fmt,
2382 va_arg(count, unsigned long));
2383#ifdef HAVE_LONG_LONG
2384 else if (longlongflag)
2385 numprinted = sprintf(numberresult, fmt,
2386 va_arg(count, unsigned PY_LONG_LONG));
2387#endif
2388 else if (size_tflag)
2389 numprinted = sprintf(numberresult, fmt,
2390 va_arg(count, size_t));
2391 else
2392 numprinted = sprintf(numberresult, fmt,
2393 va_arg(count, unsigned int));
2394 n += numprinted;
2395 numberresult += (numprinted + 1);
2396 assert(*(numberresult - 1) == '\0');
2397 assert(*(numberresult - 2) != '\0');
2398 assert(numprinted >= 0);
2399 assert(numberresult <= numberresults + numbersize);
2400 break;
2401 case 'x':
2402 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2403 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2404 n += numprinted;
2405 numberresult += (numprinted + 1);
2406 assert(*(numberresult - 1) == '\0');
2407 assert(*(numberresult - 2) != '\0');
2408 assert(numprinted >= 0);
2409 assert(numberresult <= numberresults + numbersize);
2410 break;
2411 case 'p':
2412 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2413 /* %p is ill-defined: ensure leading 0x. */
2414 if (numberresult[1] == 'X')
2415 numberresult[1] = 'x';
2416 else if (numberresult[1] != 'x') {
2417 memmove(numberresult + 2, numberresult,
2418 strlen(numberresult) + 1);
2419 numberresult[0] = '0';
2420 numberresult[1] = 'x';
2421 numprinted += 2;
2422 }
2423 n += numprinted;
2424 numberresult += (numprinted + 1);
2425 assert(*(numberresult - 1) == '\0');
2426 assert(*(numberresult - 2) != '\0');
2427 assert(numprinted >= 0);
2428 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 break;
2430 case 's':
2431 {
2432 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002433 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002434 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002435 if (!str)
2436 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 /* since PyUnicode_DecodeUTF8 returns already flexible
2438 unicode objects, there is no need to call ready on them */
2439 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002440 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002442 /* Remember the str and switch to the next slot */
2443 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 break;
2445 }
2446 case 'U':
2447 {
2448 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002449 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 if (PyUnicode_READY(obj) == -1)
2451 goto fail;
2452 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002453 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 break;
2456 }
2457 case 'V':
2458 {
2459 PyObject *obj = va_arg(count, PyObject *);
2460 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002461 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002463 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002464 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 if (PyUnicode_READY(obj) == -1)
2466 goto fail;
2467 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002468 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470 *callresult++ = NULL;
2471 }
2472 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002473 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002474 if (!str_obj)
2475 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002476 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002477 Py_DECREF(str_obj);
2478 goto fail;
2479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002481 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002483 *callresult++ = str_obj;
2484 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002485 break;
2486 }
2487 case 'S':
2488 {
2489 PyObject *obj = va_arg(count, PyObject *);
2490 PyObject *str;
2491 assert(obj);
2492 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002493 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002495 if (PyUnicode_READY(str) == -1) {
2496 Py_DECREF(str);
2497 goto fail;
2498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002500 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002502 /* Remember the str and switch to the next slot */
2503 *callresult++ = str;
2504 break;
2505 }
2506 case 'R':
2507 {
2508 PyObject *obj = va_arg(count, PyObject *);
2509 PyObject *repr;
2510 assert(obj);
2511 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002512 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002514 if (PyUnicode_READY(repr) == -1) {
2515 Py_DECREF(repr);
2516 goto fail;
2517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002519 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 /* Remember the repr and switch to the next slot */
2522 *callresult++ = repr;
2523 break;
2524 }
2525 case 'A':
2526 {
2527 PyObject *obj = va_arg(count, PyObject *);
2528 PyObject *ascii;
2529 assert(obj);
2530 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002531 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002532 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002533 if (PyUnicode_READY(ascii) == -1) {
2534 Py_DECREF(ascii);
2535 goto fail;
2536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002538 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002540 /* Remember the repr and switch to the next slot */
2541 *callresult++ = ascii;
2542 break;
2543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 default:
2545 /* if we stumble upon an unknown
2546 formatting code, copy the rest of
2547 the format string to the output
2548 string. (we cannot just skip the
2549 code, since there's no way to know
2550 what's in the argument list) */
2551 n += strlen(p);
2552 goto expand;
2553 }
2554 } else
2555 n++;
2556 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002557 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 we don't have to resize the string.
2561 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002562 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 if (!string)
2564 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 kind = PyUnicode_KIND(string);
2566 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002571 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002572 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002573
2574 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2576 /* checking for == because the last argument could be a empty
2577 string, which causes i to point to end, the assert at the end of
2578 the loop */
2579 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002580
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 switch (*f) {
2582 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002583 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 const int ordinal = va_arg(vargs, int);
2585 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002587 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002588 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 case 'p':
2593 /* unused, since we already have the result */
2594 if (*f == 'p')
2595 (void) va_arg(vargs, void *);
2596 else
2597 (void) va_arg(vargs, int);
2598 /* extract the result from numberresults and append. */
2599 for (; *numberresult; ++i, ++numberresult)
2600 PyUnicode_WRITE(kind, data, i, *numberresult);
2601 /* skip over the separating '\0' */
2602 assert(*numberresult == '\0');
2603 numberresult++;
2604 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 case 's':
2607 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 size = PyUnicode_GET_LENGTH(*callresult);
2612 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002615 /* We're done with the unicode()/repr() => forget it */
2616 Py_DECREF(*callresult);
2617 /* switch to next unicode()/repr() result */
2618 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 break;
2620 }
2621 case 'U':
2622 {
2623 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 Py_ssize_t size;
2625 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2626 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002627 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 break;
2630 }
2631 case 'V':
2632 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 size = PyUnicode_GET_LENGTH(obj);
2638 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002639 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 size = PyUnicode_GET_LENGTH(*callresult);
2643 assert(PyUnicode_KIND(*callresult) <=
2644 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002645 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 break;
2651 }
2652 case 'S':
2653 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002654 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002656 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* unused, since we already have the result */
2658 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002660 copy_characters(string, i, *callresult, 0, size);
2661 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 /* We're done with the unicode()/repr() => forget it */
2663 Py_DECREF(*callresult);
2664 /* switch to next unicode()/repr() result */
2665 ++callresult;
2666 break;
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 break;
2671 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 for (; *p; ++p, ++i)
2673 PyUnicode_WRITE(kind, data, i, *p);
2674 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 goto end;
2676 }
Victor Stinner1205f272010-09-11 00:54:47 +00002677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 else {
2679 assert(i < PyUnicode_GET_LENGTH(string));
2680 PyUnicode_WRITE(kind, data, i++, *f);
2681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 if (callresults)
2687 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 if (numberresults)
2689 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002690 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002691 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 if (callresults) {
2693 PyObject **callresult2 = callresults;
2694 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002695 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 ++callresult2;
2697 }
2698 PyObject_Free(callresults);
2699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 if (numberresults)
2701 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703}
2704
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705PyObject *
2706PyUnicode_FromFormat(const char *format, ...)
2707{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 PyObject* ret;
2709 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710
2711#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 ret = PyUnicode_FromFormatV(format, vargs);
2717 va_end(vargs);
2718 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719}
2720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721#ifdef HAVE_WCHAR_H
2722
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2724 convert a Unicode object to a wide character string.
2725
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727 character) required to convert the unicode object. Ignore size argument.
2728
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002733unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002734 wchar_t *w,
2735 Py_ssize_t size)
2736{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 const wchar_t *wstr;
2739
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002740 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 if (wstr == NULL)
2742 return -1;
2743
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 if (size > res)
2746 size = res + 1;
2747 else
2748 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002750 return res;
2751 }
2752 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002754}
2755
2756Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002757PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002758 wchar_t *w,
2759 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760{
2761 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 PyErr_BadInternalCall();
2763 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002765 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766}
2767
Victor Stinner137c34c2010-09-29 10:25:54 +00002768wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002769PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 Py_ssize_t *size)
2771{
2772 wchar_t* buffer;
2773 Py_ssize_t buflen;
2774
2775 if (unicode == NULL) {
2776 PyErr_BadInternalCall();
2777 return NULL;
2778 }
2779
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002780 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 if (buflen == -1)
2782 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002783 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 PyErr_NoMemory();
2785 return NULL;
2786 }
2787
Victor Stinner137c34c2010-09-29 10:25:54 +00002788 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2789 if (buffer == NULL) {
2790 PyErr_NoMemory();
2791 return NULL;
2792 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002793 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 if (buflen == -1)
2795 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 if (size != NULL)
2797 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002798 return buffer;
2799}
2800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802
Alexander Belopolsky40018472011-02-26 01:02:56 +00002803PyObject *
2804PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002807 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 PyErr_SetString(PyExc_ValueError,
2809 "chr() arg not in range(0x110000)");
2810 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (ordinal < 256)
2814 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 v = PyUnicode_New(1, ordinal);
2817 if (v == NULL)
2818 return NULL;
2819 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002820 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002822}
2823
Alexander Belopolsky40018472011-02-26 01:02:56 +00002824PyObject *
2825PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002830 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002831 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 Py_INCREF(obj);
2833 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002834 }
2835 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 /* For a Unicode subtype that's not a Unicode object,
2837 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002838 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002839 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 PyErr_Format(PyExc_TypeError,
2841 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002842 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002843 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844}
2845
Alexander Belopolsky40018472011-02-26 01:02:56 +00002846PyObject *
2847PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002848 const char *encoding,
2849 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002851 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 PyErr_BadInternalCall();
2856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 /* Decoding bytes objects is the most common case and should be fast */
2860 if (PyBytes_Check(obj)) {
2861 if (PyBytes_GET_SIZE(obj) == 0) {
2862 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002863 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002864 }
2865 else {
2866 v = PyUnicode_Decode(
2867 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2868 encoding, errors);
2869 }
2870 return v;
2871 }
2872
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002873 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 PyErr_SetString(PyExc_TypeError,
2875 "decoding str is not supported");
2876 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2880 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2881 PyErr_Format(PyExc_TypeError,
2882 "coercing to str: need bytes, bytearray "
2883 "or buffer-like object, %.80s found",
2884 Py_TYPE(obj)->tp_name);
2885 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002886 }
Tim Petersced69f82003-09-16 20:30:58 +00002887
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002888 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002890 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Tim Petersced69f82003-09-16 20:30:58 +00002892 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002896 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897}
2898
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` receives the NUL-terminated result and must provide room for
   `lower_len` bytes.

   Return 1 on success, 0 on error (the normalized name plus its
   terminating NUL does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored; this
       also keeps `lower_len - 1` below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name obeys the same size contract as any other. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2935
Alexander Belopolsky40018472011-02-26 01:02:56 +00002936PyObject *
2937PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002938 Py_ssize_t size,
2939 const char *encoding,
2940 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002941{
2942 PyObject *buffer = NULL, *unicode;
2943 Py_buffer info;
2944 char lower[11]; /* Enough for any encoding shortcut */
2945
Fred Drakee4315f52000-05-09 19:53:39 +00002946 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002947 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002948 if ((strcmp(lower, "utf-8") == 0) ||
2949 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002950 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002951 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002952 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002953 (strcmp(lower, "iso-8859-1") == 0))
2954 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002955#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "mbcs") == 0)
2957 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002958#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002959 else if (strcmp(lower, "ascii") == 0)
2960 return PyUnicode_DecodeASCII(s, size, errors);
2961 else if (strcmp(lower, "utf-16") == 0)
2962 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2963 else if (strcmp(lower, "utf-32") == 0)
2964 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
2967 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002969 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002971 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 if (buffer == NULL)
2973 goto onError;
2974 unicode = PyCodec_Decode(buffer, encoding, errors);
2975 if (unicode == NULL)
2976 goto onError;
2977 if (!PyUnicode_Check(unicode)) {
2978 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002979 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002980 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 Py_DECREF(unicode);
2982 goto onError;
2983 }
2984 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002985 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002986
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 Py_XDECREF(buffer);
2989 return NULL;
2990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 const char *encoding,
2995 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002996{
2997 PyObject *v;
2998
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 goto onError;
3002 }
3003
3004 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003006
3007 /* Decode via the codec registry */
3008 v = PyCodec_Decode(unicode, encoding, errors);
3009 if (v == NULL)
3010 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003011 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014 return NULL;
3015}
3016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003019 const char *encoding,
3020 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003021{
3022 PyObject *v;
3023
3024 if (!PyUnicode_Check(unicode)) {
3025 PyErr_BadArgument();
3026 goto onError;
3027 }
3028
3029 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003031
3032 /* Decode via the codec registry */
3033 v = PyCodec_Decode(unicode, encoding, errors);
3034 if (v == NULL)
3035 goto onError;
3036 if (!PyUnicode_Check(v)) {
3037 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003038 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003039 Py_TYPE(v)->tp_name);
3040 Py_DECREF(v);
3041 goto onError;
3042 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003043 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046 return NULL;
3047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
3050PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 Py_ssize_t size,
3052 const char *encoding,
3053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054{
3055 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 unicode = PyUnicode_FromUnicode(s, size);
3058 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3061 Py_DECREF(unicode);
3062 return v;
3063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
3066PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003067 const char *encoding,
3068 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003069{
3070 PyObject *v;
3071
3072 if (!PyUnicode_Check(unicode)) {
3073 PyErr_BadArgument();
3074 goto onError;
3075 }
3076
3077 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079
3080 /* Encode via the codec registry */
3081 v = PyCodec_Encode(unicode, encoding, errors);
3082 if (v == NULL)
3083 goto onError;
3084 return v;
3085
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087 return NULL;
3088}
3089
/* Find the first wide character of `wstr` that wcstombs() refuses to
   convert, by converting the string one character at a time.  When
   SIZEOF_WCHAR_T == 2, a high surrogate followed by a low surrogate is
   converted as a single unit.

   Return the index (in wchar_t units) of the offending character, or 0
   when every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* NUL-terminated scratch string holding the character under test. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *p = wstr;

    while (*p != L'\0') {
        const wchar_t *current = p;

#if SIZEOF_WCHAR_T == 2
        chunk[0] = *p++;
        if (Py_UNICODE_IS_HIGH_SURROGATE(chunk[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*p))
            chunk[1] = *p++;
        else
            chunk[1] = 0;
#else
        chunk[0] = *p++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3136
Victor Stinner1b579672011-12-17 05:47:23 +01003137static int
3138locale_error_handler(const char *errors, int *surrogateescape)
3139{
3140 if (errors == NULL) {
3141 *surrogateescape = 0;
3142 return 0;
3143 }
3144
3145 if (strcmp(errors, "strict") == 0) {
3146 *surrogateescape = 0;
3147 return 0;
3148 }
3149 if (strcmp(errors, "surrogateescape") == 0) {
3150 *surrogateescape = 1;
3151 return 0;
3152 }
3153 PyErr_Format(PyExc_ValueError,
3154 "only 'strict' and 'surrogateescape' error handlers "
3155 "are supported, not '%s'",
3156 errors);
3157 return -1;
3158}
3159
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003161PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003162{
3163 Py_ssize_t wlen, wlen2;
3164 wchar_t *wstr;
3165 PyObject *bytes = NULL;
3166 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003167 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003168 PyObject *exc;
3169 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003170 int surrogateescape;
3171
3172 if (locale_error_handler(errors, &surrogateescape) < 0)
3173 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003174
3175 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3176 if (wstr == NULL)
3177 return NULL;
3178
3179 wlen2 = wcslen(wstr);
3180 if (wlen2 != wlen) {
3181 PyMem_Free(wstr);
3182 PyErr_SetString(PyExc_TypeError, "embedded null character");
3183 return NULL;
3184 }
3185
3186 if (surrogateescape) {
3187 /* locale encoding with surrogateescape */
3188 char *str;
3189
3190 str = _Py_wchar2char(wstr, &error_pos);
3191 if (str == NULL) {
3192 if (error_pos == (size_t)-1) {
3193 PyErr_NoMemory();
3194 PyMem_Free(wstr);
3195 return NULL;
3196 }
3197 else {
3198 goto encode_error;
3199 }
3200 }
3201 PyMem_Free(wstr);
3202
3203 bytes = PyBytes_FromString(str);
3204 PyMem_Free(str);
3205 }
3206 else {
3207 size_t len, len2;
3208
3209 len = wcstombs(NULL, wstr, 0);
3210 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003211 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003212 goto encode_error;
3213 }
3214
3215 bytes = PyBytes_FromStringAndSize(NULL, len);
3216 if (bytes == NULL) {
3217 PyMem_Free(wstr);
3218 return NULL;
3219 }
3220
3221 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3222 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003223 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003224 goto encode_error;
3225 }
3226 PyMem_Free(wstr);
3227 }
3228 return bytes;
3229
3230encode_error:
3231 errmsg = strerror(errno);
3232 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003233
3234 if (error_pos == (size_t)-1)
3235 error_pos = wcstombs_errorpos(wstr);
3236
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003237 PyMem_Free(wstr);
3238 Py_XDECREF(bytes);
3239
Victor Stinner2f197072011-12-17 07:08:30 +01003240 if (errmsg != NULL) {
3241 size_t errlen;
3242 wstr = _Py_char2wchar(errmsg, &errlen);
3243 if (wstr != NULL) {
3244 reason = PyUnicode_FromWideChar(wstr, errlen);
3245 PyMem_Free(wstr);
3246 } else
3247 errmsg = NULL;
3248 }
3249 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003250 reason = PyUnicode_FromString(
3251 "wcstombs() encountered an unencodable "
3252 "wide character");
3253 if (reason == NULL)
3254 return NULL;
3255
3256 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3257 "locale", unicode,
3258 (Py_ssize_t)error_pos,
3259 (Py_ssize_t)(error_pos+1),
3260 reason);
3261 Py_DECREF(reason);
3262 if (exc != NULL) {
3263 PyCodec_StrictErrors(exc);
3264 Py_XDECREF(exc);
3265 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003266 return NULL;
3267}
3268
Victor Stinnerad158722010-10-27 00:25:46 +00003269PyObject *
3270PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003271{
Victor Stinner99b95382011-07-04 14:23:54 +02003272#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003273 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003274#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003275 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003276#else
Victor Stinner793b5312011-04-27 00:24:21 +02003277 PyInterpreterState *interp = PyThreadState_GET()->interp;
3278 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3279 cannot use it to encode and decode filenames before it is loaded. Load
3280 the Python codec requires to encode at least its own filename. Use the C
3281 version of the locale codec until the codec registry is initialized and
3282 the Python codec is loaded.
3283
3284 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3285 cannot only rely on it: check also interp->fscodec_initialized for
3286 subinterpreters. */
3287 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003288 return PyUnicode_AsEncodedString(unicode,
3289 Py_FileSystemDefaultEncoding,
3290 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003291 }
3292 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003293 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003294 }
Victor Stinnerad158722010-10-27 00:25:46 +00003295#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003296}
3297
Alexander Belopolsky40018472011-02-26 01:02:56 +00003298PyObject *
3299PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003300 const char *encoding,
3301 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302{
3303 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003304 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 if (!PyUnicode_Check(unicode)) {
3307 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
Fred Drakee4315f52000-05-09 19:53:39 +00003310
Fred Drakee4315f52000-05-09 19:53:39 +00003311 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003312 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003313 if ((strcmp(lower, "utf-8") == 0) ||
3314 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003315 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003318 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003319 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003320 }
Victor Stinner37296e82010-06-10 13:36:23 +00003321 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003322 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003323 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003324 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003325#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003326 else if (strcmp(lower, "mbcs") == 0)
3327 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003328#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003329 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332
3333 /* Encode via the codec registry */
3334 v = PyCodec_Encode(unicode, encoding, errors);
3335 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003336 return NULL;
3337
3338 /* The normal path */
3339 if (PyBytes_Check(v))
3340 return v;
3341
3342 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003343 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003345 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003346
3347 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3348 "encoder %s returned bytearray instead of bytes",
3349 encoding);
3350 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003351 Py_DECREF(v);
3352 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003353 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003354
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003355 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3356 Py_DECREF(v);
3357 return b;
3358 }
3359
3360 PyErr_Format(PyExc_TypeError,
3361 "encoder did not return a bytes object (type=%.400s)",
3362 Py_TYPE(v)->tp_name);
3363 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003364 return NULL;
3365}
3366
Alexander Belopolsky40018472011-02-26 01:02:56 +00003367PyObject *
3368PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003369 const char *encoding,
3370 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003371{
3372 PyObject *v;
3373
3374 if (!PyUnicode_Check(unicode)) {
3375 PyErr_BadArgument();
3376 goto onError;
3377 }
3378
3379 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381
3382 /* Encode via the codec registry */
3383 v = PyCodec_Encode(unicode, encoding, errors);
3384 if (v == NULL)
3385 goto onError;
3386 if (!PyUnicode_Check(v)) {
3387 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003388 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_TYPE(v)->tp_name);
3390 Py_DECREF(v);
3391 goto onError;
3392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 return NULL;
3397}
3398
/* Find the offset of the first byte sequence in the `len` bytes at `str`
   that mbrtowc() reports as invalid or incomplete.

   Return that byte offset, or 0 when no such sequence is found.  Without
   HAVE_MBRTOWC the input is not examined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    mbstate_t state;
    size_t consumed;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; len != 0; p += consumed, len -= consumed) {
        consumed = mbrtowc(&wc, (char*)p, len, &state);
        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return p - str;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3429
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003430PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003432 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003433{
3434 wchar_t smallbuf[256];
3435 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3436 wchar_t *wstr;
3437 size_t wlen, wlen2;
3438 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003439 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003440 size_t error_pos;
3441 char *errmsg;
3442 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003443
3444 if (locale_error_handler(errors, &surrogateescape) < 0)
3445 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003446
3447 if (str[len] != '\0' || len != strlen(str)) {
3448 PyErr_SetString(PyExc_TypeError, "embedded null character");
3449 return NULL;
3450 }
3451
3452 if (surrogateescape)
3453 {
3454 wstr = _Py_char2wchar(str, &wlen);
3455 if (wstr == NULL) {
3456 if (wlen == (size_t)-1)
3457 PyErr_NoMemory();
3458 else
3459 PyErr_SetFromErrno(PyExc_OSError);
3460 return NULL;
3461 }
3462
3463 unicode = PyUnicode_FromWideChar(wstr, wlen);
3464 PyMem_Free(wstr);
3465 }
3466 else {
3467#ifndef HAVE_BROKEN_MBSTOWCS
3468 wlen = mbstowcs(NULL, str, 0);
3469#else
3470 wlen = len;
3471#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003472 if (wlen == (size_t)-1)
3473 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003474 if (wlen+1 <= smallbuf_len) {
3475 wstr = smallbuf;
3476 }
3477 else {
3478 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3479 return PyErr_NoMemory();
3480
3481 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3482 if (!wstr)
3483 return PyErr_NoMemory();
3484 }
3485
3486 /* This shouldn't fail now */
3487 wlen2 = mbstowcs(wstr, str, wlen+1);
3488 if (wlen2 == (size_t)-1) {
3489 if (wstr != smallbuf)
3490 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003491 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003492 }
3493#ifdef HAVE_BROKEN_MBSTOWCS
3494 assert(wlen2 == wlen);
3495#endif
3496 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3497 if (wstr != smallbuf)
3498 PyMem_Free(wstr);
3499 }
3500 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003501
3502decode_error:
3503 errmsg = strerror(errno);
3504 assert(errmsg != NULL);
3505
3506 error_pos = mbstowcs_errorpos(str, len);
3507 if (errmsg != NULL) {
3508 size_t errlen;
3509 wstr = _Py_char2wchar(errmsg, &errlen);
3510 if (wstr != NULL) {
3511 reason = PyUnicode_FromWideChar(wstr, errlen);
3512 PyMem_Free(wstr);
3513 } else
3514 errmsg = NULL;
3515 }
3516 if (errmsg == NULL)
3517 reason = PyUnicode_FromString(
3518 "mbstowcs() encountered an invalid multibyte sequence");
3519 if (reason == NULL)
3520 return NULL;
3521
3522 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3523 "locale", str, len,
3524 (Py_ssize_t)error_pos,
3525 (Py_ssize_t)(error_pos+1),
3526 reason);
3527 Py_DECREF(reason);
3528 if (exc != NULL) {
3529 PyCodec_StrictErrors(exc);
3530 Py_XDECREF(exc);
3531 }
3532 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533}
3534
3535PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003536PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537{
3538 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003539 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540}
3541
3542
3543PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003544PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003545 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003546 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3547}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003548
Christian Heimes5894ba72007-11-04 11:43:14 +00003549PyObject*
3550PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3551{
Victor Stinner99b95382011-07-04 14:23:54 +02003552#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003553 return PyUnicode_DecodeMBCS(s, size, NULL);
3554#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003555 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003556#else
Victor Stinner793b5312011-04-27 00:24:21 +02003557 PyInterpreterState *interp = PyThreadState_GET()->interp;
3558 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3559 cannot use it to encode and decode filenames before it is loaded. Load
3560 the Python codec requires to encode at least its own filename. Use the C
3561 version of the locale codec until the codec registry is initialized and
3562 the Python codec is loaded.
3563
3564 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3565 cannot only rely on it: check also interp->fscodec_initialized for
3566 subinterpreters. */
3567 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003568 return PyUnicode_Decode(s, size,
3569 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003570 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003571 }
3572 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003573 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574 }
Victor Stinnerad158722010-10-27 00:25:46 +00003575#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003576}
3577
Martin v. Löwis011e8422009-05-05 04:43:17 +00003578
3579int
Antoine Pitrou13348842012-01-29 18:36:34 +01003580_PyUnicode_HasNULChars(PyObject* s)
3581{
3582 static PyObject *nul = NULL;
3583
3584 if (nul == NULL)
3585 nul = PyUnicode_FromStringAndSize("\0", 1);
3586 if (nul == NULL)
3587 return -1;
3588 return PyUnicode_Contains(s, nul);
3589}
3590
3591
3592int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003593PyUnicode_FSConverter(PyObject* arg, void* addr)
3594{
3595 PyObject *output = NULL;
3596 Py_ssize_t size;
3597 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003598 if (arg == NULL) {
3599 Py_DECREF(*(PyObject**)addr);
3600 return 1;
3601 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003602 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003603 output = arg;
3604 Py_INCREF(output);
3605 }
3606 else {
3607 arg = PyUnicode_FromObject(arg);
3608 if (!arg)
3609 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003610 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003611 Py_DECREF(arg);
3612 if (!output)
3613 return 0;
3614 if (!PyBytes_Check(output)) {
3615 Py_DECREF(output);
3616 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3617 return 0;
3618 }
3619 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003620 size = PyBytes_GET_SIZE(output);
3621 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003623 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003624 Py_DECREF(output);
3625 return 0;
3626 }
3627 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003628 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003629}
3630
3631
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003632int
3633PyUnicode_FSDecoder(PyObject* arg, void* addr)
3634{
3635 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003636 if (arg == NULL) {
3637 Py_DECREF(*(PyObject**)addr);
3638 return 1;
3639 }
3640 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003641 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003642 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003643 output = arg;
3644 Py_INCREF(output);
3645 }
3646 else {
3647 arg = PyBytes_FromObject(arg);
3648 if (!arg)
3649 return 0;
3650 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3651 PyBytes_GET_SIZE(arg));
3652 Py_DECREF(arg);
3653 if (!output)
3654 return 0;
3655 if (!PyUnicode_Check(output)) {
3656 Py_DECREF(output);
3657 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3658 return 0;
3659 }
3660 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003661 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003662 Py_DECREF(output);
3663 return 0;
3664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003666 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003667 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3668 Py_DECREF(output);
3669 return 0;
3670 }
3671 *(PyObject**)addr = output;
3672 return Py_CLEANUP_SUPPORTED;
3673}
3674
3675
Martin v. Löwis5b222132007-06-10 09:51:05 +00003676char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003678{
Christian Heimesf3863112007-11-22 07:46:41 +00003679 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003681 if (!PyUnicode_Check(unicode)) {
3682 PyErr_BadArgument();
3683 return NULL;
3684 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003685 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003686 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003688 if (PyUnicode_UTF8(unicode) == NULL) {
3689 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003690 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3691 if (bytes == NULL)
3692 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003693 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3694 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695 Py_DECREF(bytes);
3696 return NULL;
3697 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003698 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3699 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3700 PyBytes_AS_STRING(bytes),
3701 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003702 Py_DECREF(bytes);
3703 }
3704
3705 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003706 *psize = PyUnicode_UTF8_LENGTH(unicode);
3707 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003708}
3709
3710char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3714}
3715
3716#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003717static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718#endif
3719
3720
3721Py_UNICODE *
3722PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3723{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003724 const unsigned char *one_byte;
3725#if SIZEOF_WCHAR_T == 4
3726 const Py_UCS2 *two_bytes;
3727#else
3728 const Py_UCS4 *four_bytes;
3729 const Py_UCS4 *ucs4_end;
3730 Py_ssize_t num_surrogates;
3731#endif
3732 wchar_t *w;
3733 wchar_t *wchar_end;
3734
3735 if (!PyUnicode_Check(unicode)) {
3736 PyErr_BadArgument();
3737 return NULL;
3738 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003739 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003741 assert(_PyUnicode_KIND(unicode) != 0);
3742 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743
3744#ifdef Py_DEBUG
3745 ++unicode_as_unicode_calls;
3746#endif
3747
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3751 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 num_surrogates = 0;
3753
3754 for (; four_bytes < ucs4_end; ++four_bytes) {
3755 if (*four_bytes > 0xFFFF)
3756 ++num_surrogates;
3757 }
3758
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3760 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3761 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 PyErr_NoMemory();
3763 return NULL;
3764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 w = _PyUnicode_WSTR(unicode);
3768 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3769 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3771 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003772 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003774 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3775 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 }
3777 else
3778 *w = *four_bytes;
3779
3780 if (w > wchar_end) {
3781 assert(0 && "Miscalculated string end");
3782 }
3783 }
3784 *w = 0;
3785#else
3786 /* sizeof(wchar_t) == 4 */
3787 Py_FatalError("Impossible unicode object state, wstr and str "
3788 "should share memory already.");
3789 return NULL;
3790#endif
3791 }
3792 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003793 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3794 (_PyUnicode_LENGTH(unicode) + 1));
3795 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 PyErr_NoMemory();
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3800 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3801 w = _PyUnicode_WSTR(unicode);
3802 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3805 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 for (; w < wchar_end; ++one_byte, ++w)
3807 *w = *one_byte;
3808 /* null-terminate the wstr */
3809 *w = 0;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 for (; w < wchar_end; ++two_bytes, ++w)
3815 *w = *two_bytes;
3816 /* null-terminate the wstr */
3817 *w = 0;
3818#else
3819 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 PyObject_FREE(_PyUnicode_WSTR(unicode));
3821 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 Py_FatalError("Impossible unicode object state, wstr "
3823 "and str should share memory already.");
3824 return NULL;
3825#endif
3826 }
3827 else {
3828 assert(0 && "This should never happen.");
3829 }
3830 }
3831 }
3832 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 *size = PyUnicode_WSTR_LENGTH(unicode);
3834 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835}
3836
Alexander Belopolsky40018472011-02-26 01:02:56 +00003837Py_UNICODE *
3838PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841}
3842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844Py_ssize_t
3845PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
3847 if (!PyUnicode_Check(unicode)) {
3848 PyErr_BadArgument();
3849 goto onError;
3850 }
3851 return PyUnicode_GET_SIZE(unicode);
3852
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 return -1;
3855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857Py_ssize_t
3858PyUnicode_GetLength(PyObject *unicode)
3859{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003860 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 PyErr_BadArgument();
3862 return -1;
3863 }
3864
3865 return PyUnicode_GET_LENGTH(unicode);
3866}
3867
3868Py_UCS4
3869PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3870{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003871 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3872 PyErr_BadArgument();
3873 return (Py_UCS4)-1;
3874 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003875 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003876 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 return (Py_UCS4)-1;
3878 }
3879 return PyUnicode_READ_CHAR(unicode, index);
3880}
3881
3882int
3883PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3884{
3885 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003886 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 return -1;
3888 }
Victor Stinner488fa492011-12-12 00:01:39 +01003889 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003890 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003891 PyErr_SetString(PyExc_IndexError, "string index out of range");
3892 return -1;
3893 }
Victor Stinner488fa492011-12-12 00:01:39 +01003894 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003895 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3897 index, ch);
3898 return 0;
3899}
3900
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3906
Victor Stinner554f3f02010-06-16 23:33:54 +00003907/* create or adjust a UnicodeDecodeError */
3908static void
3909make_decode_exception(PyObject **exceptionObject,
3910 const char *encoding,
3911 const char *input, Py_ssize_t length,
3912 Py_ssize_t startpos, Py_ssize_t endpos,
3913 const char *reason)
3914{
3915 if (*exceptionObject == NULL) {
3916 *exceptionObject = PyUnicodeDecodeError_Create(
3917 encoding, input, length, startpos, endpos, reason);
3918 }
3919 else {
3920 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3921 goto onError;
3922 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3923 goto onError;
3924 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3925 goto onError;
3926 }
3927 return;
3928
3929onError:
3930 Py_DECREF(*exceptionObject);
3931 *exceptionObject = NULL;
3932}
3933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934/* error handling callback helper:
3935 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003936 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 and adjust various state variables.
3938 return 0 on success, -1 on error
3939*/
3940
Alexander Belopolsky40018472011-02-26 01:02:56 +00003941static int
3942unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003943 const char *encoding, const char *reason,
3944 const char **input, const char **inend, Py_ssize_t *startinpos,
3945 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003946 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003948 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949
3950 PyObject *restuple = NULL;
3951 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003952 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003953 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t requiredsize;
3955 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003956 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 int res = -1;
3958
Victor Stinner596a6c42011-11-09 00:02:18 +01003959 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3960 outsize = PyUnicode_GET_LENGTH(*output);
3961 else
3962 outsize = _PyUnicode_WSTR_LENGTH(*output);
3963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 *errorHandler = PyCodec_LookupError(errors);
3966 if (*errorHandler == NULL)
3967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 }
3969
Victor Stinner554f3f02010-06-16 23:33:54 +00003970 make_decode_exception(exceptionObject,
3971 encoding,
3972 *input, *inend - *input,
3973 *startinpos, *endinpos,
3974 reason);
3975 if (*exceptionObject == NULL)
3976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977
3978 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3979 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003982 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 }
3985 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003987 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003988 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003989
3990 /* Copy back the bytes variables, which might have been modified by the
3991 callback */
3992 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3993 if (!inputobj)
3994 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003995 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003997 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003998 *input = PyBytes_AS_STRING(inputobj);
3999 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004000 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004001 /* we can DECREF safely, as the exception has another reference,
4002 so the object won't go away. */
4003 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004007 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4009 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011
Victor Stinner596a6c42011-11-09 00:02:18 +01004012 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4013 /* need more space? (at least enough for what we
4014 have+the replacement+the rest of the string (starting
4015 at the new input position), so we won't have to check space
4016 when there are no errors in the rest of the string) */
4017 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4018 requiredsize = *outpos + replen + insize-newpos;
4019 if (requiredsize > outsize) {
4020 if (requiredsize<2*outsize)
4021 requiredsize = 2*outsize;
4022 if (unicode_resize(output, requiredsize) < 0)
4023 goto onError;
4024 }
4025 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004027 copy_characters(*output, *outpos, repunicode, 0, replen);
4028 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004030 else {
4031 wchar_t *repwstr;
4032 Py_ssize_t repwlen;
4033 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4034 if (repwstr == NULL)
4035 goto onError;
4036 /* need more space? (at least enough for what we
4037 have+the replacement+the rest of the string (starting
4038 at the new input position), so we won't have to check space
4039 when there are no errors in the rest of the string) */
4040 requiredsize = *outpos + repwlen + insize-newpos;
4041 if (requiredsize > outsize) {
4042 if (requiredsize < 2*outsize)
4043 requiredsize = 2*outsize;
4044 if (unicode_resize(output, requiredsize) < 0)
4045 goto onError;
4046 }
4047 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4048 *outpos += repwlen;
4049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004051 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 /* we made it! */
4054 res = 0;
4055
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 Py_XDECREF(restuple);
4058 return res;
4059}
4060
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061/* --- UTF-7 Codec -------------------------------------------------------- */
4062
Antoine Pitrou244651a2009-05-04 18:56:13 +00004063/* See RFC2152 for details. We encode conservatively and decode liberally. */
4064
/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that an expression passed
 * as either flag cannot regroup with the surrounding && and ||. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004141
Alexander Belopolsky40018472011-02-26 01:02:56 +00004142PyObject *
4143PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004144 Py_ssize_t size,
4145 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004146{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004147 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4148}
4149
Antoine Pitrou244651a2009-05-04 18:56:13 +00004150/* The decoder. The only state we preserve is our read position,
4151 * i.e. how many characters we have consumed. So if we end in the
4152 * middle of a shift sequence we have to back off the read position
4153 * and the output to the beginning of the sequence, otherwise we lose
4154 * all the shift state (seen bits, number of bits seen, high
4155 * surrogate). */
4156
Alexander Belopolsky40018472011-02-26 01:02:56 +00004157PyObject *
4158PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004159 Py_ssize_t size,
4160 const char *errors,
4161 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004164 Py_ssize_t startinpos;
4165 Py_ssize_t endinpos;
4166 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004167 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004168 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004169 const char *errmsg = "";
4170 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004171 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004172 unsigned int base64bits = 0;
4173 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004174 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 PyObject *errorHandler = NULL;
4176 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004177
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004178 /* Start off assuming it's all ASCII. Widen later as necessary. */
4179 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180 if (!unicode)
4181 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004182 if (size == 0) {
4183 if (consumed)
4184 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004185 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004186 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004187
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004188 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004189 e = s + size;
4190
4191 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004192 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004194 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004195
Antoine Pitrou244651a2009-05-04 18:56:13 +00004196 if (inShift) { /* in a base-64 section */
4197 if (IS_BASE64(ch)) { /* consume a base-64 character */
4198 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4199 base64bits += 6;
4200 s++;
4201 if (base64bits >= 16) {
4202 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004203 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004204 base64bits -= 16;
4205 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4206 if (surrogate) {
4207 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004208 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4209 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004210 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4211 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004212 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004213 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004214 }
4215 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004216 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4217 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004218 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219 }
4220 }
Victor Stinner551ac952011-11-29 22:58:13 +01004221 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004222 /* first surrogate */
4223 surrogate = outCh;
4224 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004226 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4227 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004228 }
4229 }
4230 }
4231 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004232 inShift = 0;
4233 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004234 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004235 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4236 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004237 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004238 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004239 if (base64bits > 0) { /* left-over bits */
4240 if (base64bits >= 6) {
4241 /* We've seen at least one base-64 character */
4242 errmsg = "partial character in shift sequence";
4243 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004245 else {
4246 /* Some bits remain; they should be zero */
4247 if (base64buffer != 0) {
4248 errmsg = "non-zero padding bits in shift sequence";
4249 goto utf7Error;
4250 }
4251 }
4252 }
4253 if (ch != '-') {
4254 /* '-' is absorbed; other terminating
4255 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004256 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4257 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004259 }
4260 }
4261 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 s++; /* consume '+' */
4264 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004265 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004266 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4267 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004268 }
4269 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004271 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004273 }
4274 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004275 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004276 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4277 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004278 s++;
4279 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004280 else {
4281 startinpos = s-starts;
4282 s++;
4283 errmsg = "unexpected special character";
4284 goto utf7Error;
4285 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 endinpos = s-starts;
4289 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 errors, &errorHandler,
4291 "utf7", errmsg,
4292 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004293 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295 }
4296
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 /* end of string */
4298
4299 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4300 /* if we're in an inconsistent state, that's an error */
4301 if (surrogate ||
4302 (base64bits >= 6) ||
4303 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304 endinpos = size;
4305 if (unicode_decode_call_errorhandler(
4306 errors, &errorHandler,
4307 "utf7", "unterminated shift sequence",
4308 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004309 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 goto onError;
4311 if (s < e)
4312 goto restart;
4313 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315
4316 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004319 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 }
4322 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004323 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004325 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 goto onError;
4329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 Py_XDECREF(errorHandler);
4331 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004332 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 Py_DECREF(unicode);
4338 return NULL;
4339}
4340
4341
Alexander Belopolsky40018472011-02-26 01:02:56 +00004342PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004343_PyUnicode_EncodeUTF7(PyObject *str,
4344 int base64SetO,
4345 int base64WhiteSpace,
4346 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004348 int kind;
4349 void *data;
4350 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004351 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004352 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004354 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 unsigned int base64bits = 0;
4356 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357 char * out;
4358 char * start;
4359
Benjamin Petersonbac79492012-01-14 13:34:47 -05004360 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004361 return NULL;
4362 kind = PyUnicode_KIND(str);
4363 data = PyUnicode_DATA(str);
4364 len = PyUnicode_GET_LENGTH(str);
4365
4366 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004369 /* It might be possible to tighten this worst case */
4370 allocated = 8 * len;
4371 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004372 return PyErr_NoMemory();
4373
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 if (v == NULL)
4376 return NULL;
4377
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004378 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004379 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004380 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 if (inShift) {
4383 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4384 /* shifting out */
4385 if (base64bits) { /* output remaining bits */
4386 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4387 base64buffer = 0;
4388 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
4390 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 /* Characters not in the BASE64 set implicitly unshift the sequence
4392 so no '-' is required, except if the character is itself a '-' */
4393 if (IS_BASE64(ch) || ch == '-') {
4394 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 *out++ = (char) ch;
4397 }
4398 else {
4399 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 else { /* not in a shift sequence */
4403 if (ch == '+') {
4404 *out++ = '+';
4405 *out++ = '-';
4406 }
4407 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4408 *out++ = (char) ch;
4409 }
4410 else {
4411 *out++ = '+';
4412 inShift = 1;
4413 goto encode_char;
4414 }
4415 }
4416 continue;
4417encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004419 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004420
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 /* code first surrogate */
4422 base64bits += 16;
4423 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4424 while (base64bits >= 6) {
4425 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4426 base64bits -= 6;
4427 }
4428 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004429 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 base64bits += 16;
4432 base64buffer = (base64buffer << 16) | ch;
4433 while (base64bits >= 6) {
4434 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4435 base64bits -= 6;
4436 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 if (base64bits)
4439 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4440 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004442 if (_PyBytes_Resize(&v, out - start) < 0)
4443 return NULL;
4444 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004445}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004446PyObject *
4447PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4448 Py_ssize_t size,
4449 int base64SetO,
4450 int base64WhiteSpace,
4451 const char *errors)
4452{
4453 PyObject *result;
4454 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4455 if (tmp == NULL)
4456 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004457 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004458 base64WhiteSpace, errors);
4459 Py_DECREF(tmp);
4460 return result;
4461}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462
/* The UTF-7 section ends here: drop its helper macros so that none of them
   stays defined for the code that follows. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463#undef IS_BASE64
4464#undef FROM_BASE64
4465#undef TO_BASE64
4466#undef DECODE_DIRECT
4467#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469/* --- UTF-8 Codec -------------------------------------------------------- */
4470
/* Indexed by the first byte of a UTF-8 sequence; yields the total length of
   that sequence in bytes.  An entry of zero means the byte is an illegal
   prefix (80-BF, C0-C1 and F5-FF).  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: never a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 illegal, C2-DF: two bytes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4: four bytes, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
4492
Alexander Belopolsky40018472011-02-26 01:02:56 +00004493PyObject *
4494PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004495 Py_ssize_t size,
4496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497{
Walter Dörwald69652032004-09-07 20:24:22 +00004498 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4499}
4500
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004501#include "stringlib/ucs1lib.h"
4502#include "stringlib/codecs.h"
4503#include "stringlib/undef.h"
4504
4505#include "stringlib/ucs2lib.h"
4506#include "stringlib/codecs.h"
4507#include "stringlib/undef.h"
4508
4509#include "stringlib/ucs4lib.h"
4510#include "stringlib/codecs.h"
4511#include "stringlib/undef.h"
4512
Antoine Pitrouab868312009-01-10 15:40:25 +00004513/* Mask to check or force alignment of a pointer to C 'long' boundaries */
/* Usage seen in this file: `(size_t) ptr & LONG_PTR_MASK` is zero when ptr
   is long-aligned, and `(size_t) ptr & ~LONG_PTR_MASK` rounds ptr down to a
   long boundary.
   NOTE(review): the expansion is a cast expression without enclosing
   parentheses; the two usages above parse as intended, but check precedence
   before using the macro inside any other kind of expression. */
4514#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4515
4516/* Mask to quickly check whether a C 'long' contains a
4517 non-ASCII, UTF8-encoded char. */
/* Every byte of the constant is 0x80, so `word & ASCII_CHAR_MASK` is non-zero
   exactly when at least one byte of `word` has its top bit set.  Builds with
   a `long` that is neither 4 nor 8 bytes are rejected at compile time. */
4518#if (SIZEOF_LONG == 8)
4519# define ASCII_CHAR_MASK 0x8080808080808080L
4520#elif (SIZEOF_LONG == 4)
4521# define ASCII_CHAR_MASK 0x80808080L
4522#else
4523# error C 'long' size should be either 4 or 8!
4524#endif
4525
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004526/* Scans a UTF-8 string and returns the maximum character to be expected
4527 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004529 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531 */
4532static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004533utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004535 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536 const unsigned char *end = p + string_size;
4537 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004538
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004539 assert(unicode_size != NULL);
4540
4541 /* By having a cascade of independent loops which fallback onto each
4542 other, we minimize the amount of work done in the average loop
4543 iteration, and we also maximize the CPU's ability to predict
4544 branches correctly (because a given condition will have always the
4545 same boolean outcome except perhaps in the last iteration of the
4546 corresponding loop).
4547 In the general case this brings us rather close to decoding
4548 performance pre-PEP 393, despite the two-pass decoding.
4549
4550 Note that the pure ASCII loop is not duplicated once a non-ASCII
4551 character has been encountered. It is actually a pessimization (by
4552 a significant factor) to use this loop on text with many non-ASCII
4553 characters, and it is important to avoid bad performance on valid
4554 utf-8 data (invalid utf-8 being a different can of worms).
4555 */
4556
4557 /* ASCII */
4558 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004559 /* Only check value if it's not a ASCII char... */
4560 if (*p < 0x80) {
4561 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4562 an explanation. */
4563 if (!((size_t) p & LONG_PTR_MASK)) {
4564 /* Help register allocation */
4565 register const unsigned char *_p = p;
4566 while (_p < aligned_end) {
4567 unsigned long value = *(unsigned long *) _p;
4568 if (value & ASCII_CHAR_MASK)
4569 break;
4570 _p += SIZEOF_LONG;
4571 char_count += SIZEOF_LONG;
4572 }
4573 p = _p;
4574 if (p == end)
4575 break;
4576 }
4577 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004578 if (*p < 0x80)
4579 ++char_count;
4580 else
4581 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004582 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004583 *unicode_size = char_count;
4584 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004585
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004586_ucs1loop:
4587 for (; p < end; ++p) {
4588 if (*p < 0xc4)
4589 char_count += ((*p & 0xc0) != 0x80);
4590 else
4591 goto _ucs2loop;
4592 }
4593 *unicode_size = char_count;
4594 return 255;
4595
4596_ucs2loop:
4597 for (; p < end; ++p) {
4598 if (*p < 0xf0)
4599 char_count += ((*p & 0xc0) != 0x80);
4600 else
4601 goto _ucs4loop;
4602 }
4603 *unicode_size = char_count;
4604 return 65535;
4605
4606_ucs4loop:
4607 for (; p < end; ++p) {
4608 char_count += ((*p & 0xc0) != 0x80);
4609 }
4610 *unicode_size = char_count;
4611 return 65537;
4612}
4613
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004615 in case of errors. Implicit parameters: unicode, kind, data, onError.
4616 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004617*/
/* Details of the expansion:
   - `index` is copied into a local `pos` once, so an argument with a side
     effect such as `i++` is evaluated a single time.
   - If pos is greater than the current length of `unicode`, the object is
     first resized to pos + pos/8; `value` is then stored through
     unicode_putchar().
   - If unicode_resize() or unicode_putchar() returns a negative value,
     control jumps to the `onError` label of the enclosing function.
   - The expansion itself only refers to `unicode` and `onError`; `kind` and
     `data`, listed above, are not used by it.
   NOTE(review): growth is triggered for pos > length but not for
   pos == length -- confirm that callers never write at index == length, or
   that unicode_putchar() tolerates it. */
Victor Stinner785938e2011-12-11 20:09:03 +01004618#define WRITE_MAYBE_FAIL(index, value) \
4619 do { \
4620 Py_ssize_t pos = index; \
4621 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4622 unicode_resize(&unicode, pos + pos/8) < 0) \
4623 goto onError; \
4624 if (unicode_putchar(&unicode, &pos, value) < 0) \
4625 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 } while (0)
4627
/* Error-handling UTF-8 decoder; PyUnicode_DecodeUTF8Stateful hands over to it
   when its fast decoder reports errors.

   starts, size - the complete input buffer.
   errors       - error handler name, forwarded to
                  unicode_decode_call_errorhandler().
   consumed     - if not NULL, a sequence cut off by the end of the input
                  stops decoding instead of being reported as an error, and
                  *consumed receives the number of input bytes used.
   s            - position inside the input at which decoding resumes.
   unicode      - partially filled output string; this function either
                  returns it or releases it, so the caller's reference is
                  used up in both cases.
   i            - output index at which writing resumes.

   Returns the finished string (resized to the number of characters written
   and passed through unicode_adjust_maxchar), or NULL on failure. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004628static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004629decode_utf8_errors(const char *starts,
4630 Py_ssize_t size,
4631 const char *errors,
4632 Py_ssize_t *consumed,
4633 const char *s,
4634 PyObject *unicode,
4635 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004636{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004638 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 Py_ssize_t startinpos;
4640 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004641 const char *e = starts + size;
4642 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004643 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 PyObject *errorHandler = NULL;
4645 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004646
/* e rounded down to a 'long' boundary: the word-at-a-time loop below stops
   there. */
Antoine Pitrouab868312009-01-10 15:40:25 +00004647 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
4649 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004650 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651
4652 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004653 /* Fast path for runs of ASCII characters. Given that common UTF-8
4654 input will consist of an overwhelming majority of ASCII
4655 characters, we try to optimize for this case by checking
4656 as many characters as a C 'long' can contain.
4657 First, check if we can do an aligned read, as most CPUs have
4658 a penalty for unaligned reads.
4659 */
4660 if (!((size_t) s & LONG_PTR_MASK)) {
4661 /* Help register allocation */
4662 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004663 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004664 while (_s < aligned_end) {
4665 /* Read a whole long at a time (either 4 or 8 bytes),
4666 and do a fast unrolled copy if it only contains ASCII
4667 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004668 unsigned long value = *(unsigned long *) _s;
4669 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004670 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004671 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4672 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4673 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4674 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004675#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004676 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4677 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4678 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4679 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004680#endif
4681 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004683 }
4684 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004686 if (s == e)
4687 break;
/* The word loop may have stopped in front of a non-ASCII byte, so the
   current byte is read again before it is classified. */
4688 ch = (unsigned char)*s;
4689 }
4690 }
4691
4692 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004693 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 s++;
4695 continue;
4696 }
4697
/* Total length of the sequence announced by this lead byte; 0 marks a byte
   that cannot start a sequence. */
4698 n = utf8_code_length[ch];
4699
/* The announced sequence would extend past the end of the input.  With a
   `consumed` pointer the partial sequence is simply left unconsumed;
   otherwise it is reported as an error. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004700 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 if (consumed)
4702 break;
4703 else {
4704 errmsg = "unexpected end of data";
4705 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004706 endinpos = startinpos+1;
/* Extend the error range over the continuation bytes that are present. */
4707 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4708 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 goto utf8Error;
4710 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712
4713 switch (n) {
4714
4715 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004716 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 startinpos = s-starts;
4718 endinpos = startinpos+1;
4719 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720
/* Bytes below 0x80 were already handled above and the table yields 1 only
   for those, so this case is not expected to be reached. */
4721 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004722 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 startinpos = s-starts;
4724 endinpos = startinpos+1;
4725 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
4727 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004728 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004729 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004731 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 goto utf8Error;
4733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004735 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004736 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 break;
4738
4739 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004740 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4741 will result in surrogates in range d800-dfff. Surrogates are
4742 not valid UTF-8 so they are rejected.
4743 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4744 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
/* Besides the continuation-byte checks, the E0 test rejects sequences whose
   second byte is below 0xA0 and the ED test rejects the surrogate range
   described above. */
Tim Petersced69f82003-09-16 20:30:58 +00004745 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004746 (s[2] & 0xc0) != 0x80 ||
4747 ((unsigned char)s[0] == 0xE0 &&
4748 (unsigned char)s[1] < 0xA0) ||
4749 ((unsigned char)s[0] == 0xED &&
4750 (unsigned char)s[1] > 0x9F)) {
4751 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004753 endinpos = startinpos + 1;
4754
4755 /* if s[1] first two bits are 1 and 0, then the invalid
4756 continuation byte is s[2], so increment endinpos by 1,
4757 if not, s[1] is invalid and endinpos doesn't need to
4758 be incremented. */
4759 if ((s[1] & 0xC0) == 0x80)
4760 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 goto utf8Error;
4762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004764 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004765 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004766 break;
4767
/* Four-byte sequences: the F0 test rejects a second byte below 0x90 and the
   F4 test a second byte above 0x8F; the error range covers the lead byte
   plus the leading continuation bytes that are well-formed (at most two). */
4768 case 4:
4769 if ((s[1] & 0xc0) != 0x80 ||
4770 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004771 (s[3] & 0xc0) != 0x80 ||
4772 ((unsigned char)s[0] == 0xF0 &&
4773 (unsigned char)s[1] < 0x90) ||
4774 ((unsigned char)s[0] == 0xF4 &&
4775 (unsigned char)s[1] > 0x8F)) {
4776 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004778 endinpos = startinpos + 1;
4779 if ((s[1] & 0xC0) == 0x80) {
4780 endinpos++;
4781 if ((s[2] & 0xC0) == 0x80)
4782 endinpos++;
4783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 goto utf8Error;
4785 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004786 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004787 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004788 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004789
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004790 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
4793 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004795
/* Common error exit of the loop body.  The handler is given the addresses of
   starts, e, s, unicode and i, so each of them may have changed when it
   returns; a non-zero result aborts decoding. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 if (unicode_decode_call_errorhandler(
4798 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004799 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004801 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 goto onError;
/* NOTE(review): the comment below speaks of updating "data", but the only
   statement here recomputes aligned_end from the (possibly changed) e;
   `size` is not refreshed -- confirm that is intended. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004803 /* Update data because unicode_decode_call_errorhandler might have
4804 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 }
Walter Dörwald69652032004-09-07 20:24:22 +00004807 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 /* Adjust length and ready string when it contained errors and
4811 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004812 if (unicode_resize(&unicode, i) < 0)
4813 goto onError;
/* unicode_adjust_maxchar() receives the address of `unicode`; a NULL value
   afterwards is treated as failure. */
4814 unicode_adjust_maxchar(&unicode);
4815 if (unicode == NULL)
4816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_XDECREF(errorHandler);
4819 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004820 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004821 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
/* Failure exit: release the handler, the exception object and the output
   string (which may already be NULL). */
Benjamin Peterson29060642009-01-31 22:14:21 +00004823 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 Py_XDECREF(errorHandler);
4825 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004826 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 return NULL;
4828}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004829#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004830
Victor Stinner785938e2011-12-11 20:09:03 +01004831PyObject *
4832PyUnicode_DecodeUTF8Stateful(const char *s,
4833 Py_ssize_t size,
4834 const char *errors,
4835 Py_ssize_t *consumed)
4836{
4837 Py_UCS4 maxchar = 0;
4838 Py_ssize_t unicode_size;
4839 int has_errors = 0;
4840 PyObject *unicode;
4841 int kind;
4842 void *data;
4843 const char *starts = s;
4844 const char *e;
4845 Py_ssize_t i;
4846
4847 if (size == 0) {
4848 if (consumed)
4849 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004850 Py_INCREF(unicode_empty);
4851 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004852 }
4853
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004854 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004855
4856 /* When the string is ASCII only, just use memcpy and return.
4857 unicode_size may be != size if there is an incomplete UTF-8
4858 sequence at the end of the ASCII block. */
4859 if (maxchar < 128 && size == unicode_size) {
4860 if (consumed)
4861 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004862 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004863 }
4864
4865 unicode = PyUnicode_New(unicode_size, maxchar);
4866 if (!unicode)
4867 return NULL;
4868 kind = PyUnicode_KIND(unicode);
4869 data = PyUnicode_DATA(unicode);
4870
4871 /* Unpack UTF-8 encoded data */
4872 i = 0;
4873 e = starts + size;
4874 switch (kind) {
4875 case PyUnicode_1BYTE_KIND:
4876 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4877 break;
4878 case PyUnicode_2BYTE_KIND:
4879 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4880 break;
4881 case PyUnicode_4BYTE_KIND:
4882 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4883 break;
4884 }
4885 if (!has_errors) {
4886 /* Ensure the unicode size calculation was correct */
4887 assert(i == unicode_size);
4888 assert(s == e);
4889 if (consumed)
4890 *consumed = size;
4891 return unicode;
4892 }
4893
4894 /* In case of errors, maxchar and size computation might be incorrect;
4895 code below refits and resizes as necessary. */
4896 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4897}
4898
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004899#ifdef __APPLE__
4900
4901/* Simplified UTF-8 decoder using surrogateescape error handler,
4902 used to decode the command line arguments on Mac OS X. */
4903
4904wchar_t*
4905_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4906{
4907 int n;
4908 const char *e;
4909 wchar_t *unicode, *p;
4910
4911 /* Note: size will always be longer than the resulting Unicode
4912 character count */
4913 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4914 PyErr_NoMemory();
4915 return NULL;
4916 }
4917 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4918 if (!unicode)
4919 return NULL;
4920
4921 /* Unpack UTF-8 encoded data */
4922 p = unicode;
4923 e = s + size;
4924 while (s < e) {
4925 Py_UCS4 ch = (unsigned char)*s;
4926
4927 if (ch < 0x80) {
4928 *p++ = (wchar_t)ch;
4929 s++;
4930 continue;
4931 }
4932
4933 n = utf8_code_length[ch];
4934 if (s + n > e) {
4935 goto surrogateescape;
4936 }
4937
4938 switch (n) {
4939 case 0:
4940 case 1:
4941 goto surrogateescape;
4942
4943 case 2:
4944 if ((s[1] & 0xc0) != 0x80)
4945 goto surrogateescape;
4946 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4947 assert ((ch > 0x007F) && (ch <= 0x07FF));
4948 *p++ = (wchar_t)ch;
4949 break;
4950
4951 case 3:
4952 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4953 will result in surrogates in range d800-dfff. Surrogates are
4954 not valid UTF-8 so they are rejected.
4955 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4956 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4957 if ((s[1] & 0xc0) != 0x80 ||
4958 (s[2] & 0xc0) != 0x80 ||
4959 ((unsigned char)s[0] == 0xE0 &&
4960 (unsigned char)s[1] < 0xA0) ||
4961 ((unsigned char)s[0] == 0xED &&
4962 (unsigned char)s[1] > 0x9F)) {
4963
4964 goto surrogateescape;
4965 }
4966 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4967 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004968 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004969 break;
4970
4971 case 4:
4972 if ((s[1] & 0xc0) != 0x80 ||
4973 (s[2] & 0xc0) != 0x80 ||
4974 (s[3] & 0xc0) != 0x80 ||
4975 ((unsigned char)s[0] == 0xF0 &&
4976 (unsigned char)s[1] < 0x90) ||
4977 ((unsigned char)s[0] == 0xF4 &&
4978 (unsigned char)s[1] > 0x8F)) {
4979 goto surrogateescape;
4980 }
4981 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4982 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004983 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984
4985#if SIZEOF_WCHAR_T == 4
4986 *p++ = (wchar_t)ch;
4987#else
4988 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004989 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4990 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004991#endif
4992 break;
4993 }
4994 s += n;
4995 continue;
4996
4997 surrogateescape:
4998 *p++ = 0xDC00 + ch;
4999 s++;
5000 }
5001 *p = L'\0';
5002 return unicode;
5003}
5004
5005#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005007/* Primary internal function which creates utf8 encoded bytes objects.
5008
5009 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005010 and allocate exactly as much space needed at the end. Else allocate the
5011 maximum possible needed (4 result bytes per Unicode character), and return
5012 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005013*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005014PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005015_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016{
Victor Stinner6099a032011-12-18 14:22:26 +01005017 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005018 void *data;
5019 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005021 if (!PyUnicode_Check(unicode)) {
5022 PyErr_BadArgument();
5023 return NULL;
5024 }
5025
5026 if (PyUnicode_READY(unicode) == -1)
5027 return NULL;
5028
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005029 if (PyUnicode_UTF8(unicode))
5030 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5031 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032
5033 kind = PyUnicode_KIND(unicode);
5034 data = PyUnicode_DATA(unicode);
5035 size = PyUnicode_GET_LENGTH(unicode);
5036
Benjamin Petersonead6b532011-12-20 17:23:42 -06005037 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005038 default:
5039 assert(0);
5040 case PyUnicode_1BYTE_KIND:
5041 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5042 assert(!PyUnicode_IS_ASCII(unicode));
5043 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5044 case PyUnicode_2BYTE_KIND:
5045 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5046 case PyUnicode_4BYTE_KIND:
5047 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049}
5050
Alexander Belopolsky40018472011-02-26 01:02:56 +00005051PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5053 Py_ssize_t size,
5054 const char *errors)
5055{
5056 PyObject *v, *unicode;
5057
5058 unicode = PyUnicode_FromUnicode(s, size);
5059 if (unicode == NULL)
5060 return NULL;
5061 v = _PyUnicode_AsUTF8String(unicode, errors);
5062 Py_DECREF(unicode);
5063 return v;
5064}
5065
5066PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005067PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070}
5071
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072/* --- UTF-32 Codec ------------------------------------------------------- */
5073
5074PyObject *
5075PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 Py_ssize_t size,
5077 const char *errors,
5078 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079{
5080 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5081}
5082
5083PyObject *
5084PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 Py_ssize_t size,
5086 const char *errors,
5087 int *byteorder,
5088 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089{
5090 const char *starts = s;
5091 Py_ssize_t startinpos;
5092 Py_ssize_t endinpos;
5093 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005094 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005095 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 int bo = 0; /* assume native ordering by default */
5097 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 /* Offsets from q for retrieving bytes in the right order. */
5099#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5100 int iorder[] = {0, 1, 2, 3};
5101#else
5102 int iorder[] = {3, 2, 1, 0};
5103#endif
5104 PyObject *errorHandler = NULL;
5105 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005106
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 q = (unsigned char *)s;
5108 e = q + size;
5109
5110 if (byteorder)
5111 bo = *byteorder;
5112
5113 /* Check for BOM marks (U+FEFF) in the input and adjust current
5114 byte order setting accordingly. In native mode, the leading BOM
5115 mark is skipped, in all other modes, it is copied to the output
5116 stream as-is (giving a ZWNBSP character). */
5117 if (bo == 0) {
5118 if (size >= 4) {
5119 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 if (bom == 0x0000FEFF) {
5123 q += 4;
5124 bo = -1;
5125 }
5126 else if (bom == 0xFFFE0000) {
5127 q += 4;
5128 bo = 1;
5129 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 if (bom == 0x0000FEFF) {
5132 q += 4;
5133 bo = 1;
5134 }
5135 else if (bom == 0xFFFE0000) {
5136 q += 4;
5137 bo = -1;
5138 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141 }
5142
5143 if (bo == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
5149 }
5150 else if (bo == 1) {
5151 /* force BE */
5152 iorder[0] = 3;
5153 iorder[1] = 2;
5154 iorder[2] = 1;
5155 iorder[3] = 0;
5156 }
5157
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005158 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005159 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005160 if (!unicode)
5161 return NULL;
5162 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005163 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005164 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005165
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_UCS4 ch;
5168 /* remaining bytes at the end? (size should be divisible by 4) */
5169 if (e-q<4) {
5170 if (consumed)
5171 break;
5172 errmsg = "truncated data";
5173 startinpos = ((const char *)q)-starts;
5174 endinpos = ((const char *)e)-starts;
5175 goto utf32Error;
5176 /* The remaining input chars are ignored if the callback
5177 chooses to skip the input */
5178 }
5179 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5180 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005181
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 if (ch >= 0x110000)
5183 {
5184 errmsg = "codepoint not in range(0x110000)";
5185 startinpos = ((const char *)q)-starts;
5186 endinpos = startinpos+4;
5187 goto utf32Error;
5188 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005189 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5190 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 q += 4;
5192 continue;
5193 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (unicode_decode_call_errorhandler(
5195 errors, &errorHandler,
5196 "utf32", errmsg,
5197 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005198 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200 }
5201
5202 if (byteorder)
5203 *byteorder = bo;
5204
5205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207
5208 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005209 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005210 goto onError;
5211
5212 Py_XDECREF(errorHandler);
5213 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005214 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005217 Py_DECREF(unicode);
5218 Py_XDECREF(errorHandler);
5219 Py_XDECREF(exc);
5220 return NULL;
5221}
5222
5223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005224_PyUnicode_EncodeUTF32(PyObject *str,
5225 const char *errors,
5226 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005227{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005228 int kind;
5229 void *data;
5230 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005231 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005233 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234 /* Offsets from p for storing byte pairs in the right order. */
5235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5236 int iorder[] = {0, 1, 2, 3};
5237#else
5238 int iorder[] = {3, 2, 1, 0};
5239#endif
5240
Benjamin Peterson29060642009-01-31 22:14:21 +00005241#define STORECHAR(CH) \
5242 do { \
5243 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5244 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5245 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5246 p[iorder[0]] = (CH) & 0xff; \
5247 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 } while(0)
5249
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005250 if (!PyUnicode_Check(str)) {
5251 PyErr_BadArgument();
5252 return NULL;
5253 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005254 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005255 return NULL;
5256 kind = PyUnicode_KIND(str);
5257 data = PyUnicode_DATA(str);
5258 len = PyUnicode_GET_LENGTH(str);
5259
5260 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005261 bytesize = nsize * 4;
5262 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005264 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 if (v == NULL)
5266 return NULL;
5267
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005268 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005269 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005271 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005272 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273
5274 if (byteorder == -1) {
5275 /* force LE */
5276 iorder[0] = 0;
5277 iorder[1] = 1;
5278 iorder[2] = 2;
5279 iorder[3] = 3;
5280 }
5281 else if (byteorder == 1) {
5282 /* force BE */
5283 iorder[0] = 3;
5284 iorder[1] = 2;
5285 iorder[2] = 1;
5286 iorder[3] = 0;
5287 }
5288
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005289 for (i = 0; i < len; i++)
5290 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005291
5292 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005293 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005294#undef STORECHAR
5295}
5296
Alexander Belopolsky40018472011-02-26 01:02:56 +00005297PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005298PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5299 Py_ssize_t size,
5300 const char *errors,
5301 int byteorder)
5302{
5303 PyObject *result;
5304 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5305 if (tmp == NULL)
5306 return NULL;
5307 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5308 Py_DECREF(tmp);
5309 return result;
5310}
5311
5312PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005313PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314{
Victor Stinnerb960b342011-11-20 19:12:52 +01005315 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005316}
5317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318/* --- UTF-16 Codec ------------------------------------------------------- */
5319
Tim Peters772747b2001-08-09 22:21:55 +00005320PyObject *
5321PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 Py_ssize_t size,
5323 const char *errors,
5324 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325{
Walter Dörwald69652032004-09-07 20:24:22 +00005326 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5327}
5328
Antoine Pitrouab868312009-01-10 15:40:25 +00005329/* Two masks for fast checking of whether a C 'long' may contain
5330 UTF16-encoded surrogate characters. This is an efficient heuristic,
5331 assuming that non-surrogate characters with a code point >= 0x8000 are
5332 rare in most input.
5333 FAST_CHAR_MASK is used when the input is in native byte ordering,
5334 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005335*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005336#if (SIZEOF_LONG == 8)
5337# define FAST_CHAR_MASK 0x8000800080008000L
5338# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5339#elif (SIZEOF_LONG == 4)
5340# define FAST_CHAR_MASK 0x80008000L
5341# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5342#else
5343# error C 'long' size should be either 4 or 8!
5344#endif
5345
Walter Dörwald69652032004-09-07 20:24:22 +00005346PyObject *
5347PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 Py_ssize_t size,
5349 const char *errors,
5350 int *byteorder,
5351 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t startinpos;
5355 Py_ssize_t endinpos;
5356 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005357 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005359 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005360 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005361 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005362 /* Offsets from q for retrieving byte pairs in the right order. */
5363#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5364 int ihi = 1, ilo = 0;
5365#else
5366 int ihi = 0, ilo = 1;
5367#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 PyObject *errorHandler = NULL;
5369 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
5371 /* Note: size will always be longer than the resulting Unicode
5372 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005373 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 if (!unicode)
5375 return NULL;
5376 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005377 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005378 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379
Tim Peters772747b2001-08-09 22:21:55 +00005380 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005381 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382
5383 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005384 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005386 /* Check for BOM marks (U+FEFF) in the input and adjust current
5387 byte order setting accordingly. In native mode, the leading BOM
5388 mark is skipped, in all other modes, it is copied to the output
5389 stream as-is (giving a ZWNBSP character). */
5390 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005391 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005392 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 if (bom == 0xFEFF) {
5395 q += 2;
5396 bo = -1;
5397 }
5398 else if (bom == 0xFFFE) {
5399 q += 2;
5400 bo = 1;
5401 }
Tim Petersced69f82003-09-16 20:30:58 +00005402#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 if (bom == 0xFEFF) {
5404 q += 2;
5405 bo = 1;
5406 }
5407 else if (bom == 0xFFFE) {
5408 q += 2;
5409 bo = -1;
5410 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005411#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Tim Peters772747b2001-08-09 22:21:55 +00005415 if (bo == -1) {
5416 /* force LE */
5417 ihi = 1;
5418 ilo = 0;
5419 }
5420 else if (bo == 1) {
5421 /* force BE */
5422 ihi = 0;
5423 ilo = 1;
5424 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005425#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5426 native_ordering = ilo < ihi;
5427#else
5428 native_ordering = ilo > ihi;
5429#endif
Tim Peters772747b2001-08-09 22:21:55 +00005430
Antoine Pitrouab868312009-01-10 15:40:25 +00005431 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005432 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005433 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005434 /* First check for possible aligned read of a C 'long'. Unaligned
5435 reads are more expensive, better to defer to another iteration. */
5436 if (!((size_t) q & LONG_PTR_MASK)) {
5437 /* Fast path for runs of non-surrogate chars. */
5438 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005439 int kind = PyUnicode_KIND(unicode);
5440 void *data = PyUnicode_DATA(unicode);
5441 while (_q < aligned_end) {
5442 unsigned long block = * (unsigned long *) _q;
5443 unsigned short *pblock = (unsigned short*)&block;
5444 Py_UCS4 maxch;
5445 if (native_ordering) {
5446 /* Can use buffer directly */
5447 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005448 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005449 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005450 else {
5451 /* Need to byte-swap */
5452 unsigned char *_p = (unsigned char*)pblock;
5453 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005454 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005455 _p[0] = _q[1];
5456 _p[1] = _q[0];
5457 _p[2] = _q[3];
5458 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005459#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005460 _p[4] = _q[5];
5461 _p[5] = _q[4];
5462 _p[6] = _q[7];
5463 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005464#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005465 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005466 maxch = Py_MAX(pblock[0], pblock[1]);
5467#if SIZEOF_LONG == 8
5468 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5469#endif
5470 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5471 if (unicode_widen(&unicode, maxch) < 0)
5472 goto onError;
5473 kind = PyUnicode_KIND(unicode);
5474 data = PyUnicode_DATA(unicode);
5475 }
5476 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5477 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5478#if SIZEOF_LONG == 8
5479 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5480 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5481#endif
5482 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005483 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005484 q = _q;
5485 if (q >= e)
5486 break;
5487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489
Benjamin Peterson14339b62009-01-31 16:36:08 +00005490 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005491
Victor Stinner551ac952011-11-29 22:58:13 +01005492 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005493 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5494 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 continue;
5496 }
5497
5498 /* UTF-16 code pair: */
5499 if (q > e) {
5500 errmsg = "unexpected end of data";
5501 startinpos = (((const char *)q) - 2) - starts;
5502 endinpos = ((const char *)e) + 1 - starts;
5503 goto utf16Error;
5504 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005505 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5506 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005508 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005509 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005510 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005511 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 continue;
5513 }
5514 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005515 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 startinpos = (((const char *)q)-4)-starts;
5517 endinpos = startinpos+2;
5518 goto utf16Error;
5519 }
5520
Benjamin Peterson14339b62009-01-31 16:36:08 +00005521 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 errmsg = "illegal encoding";
5523 startinpos = (((const char *)q)-2)-starts;
5524 endinpos = startinpos+2;
5525 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005526
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005529 errors,
5530 &errorHandler,
5531 "utf16", errmsg,
5532 &starts,
5533 (const char **)&e,
5534 &startinpos,
5535 &endinpos,
5536 &exc,
5537 (const char **)&q,
5538 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005539 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005542 /* remaining byte at the end? (size should be even) */
5543 if (e == q) {
5544 if (!consumed) {
5545 errmsg = "truncated data";
5546 startinpos = ((const char *)q) - starts;
5547 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005548 if (unicode_decode_call_errorhandler(
5549 errors,
5550 &errorHandler,
5551 "utf16", errmsg,
5552 &starts,
5553 (const char **)&e,
5554 &startinpos,
5555 &endinpos,
5556 &exc,
5557 (const char **)&q,
5558 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005560 goto onError;
5561 /* The remaining input chars are ignored if the callback
5562 chooses to skip the input */
5563 }
5564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
5566 if (byteorder)
5567 *byteorder = bo;
5568
Walter Dörwald69652032004-09-07 20:24:22 +00005569 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005573 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 goto onError;
5575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005578 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 Py_XDECREF(errorHandler);
5583 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 return NULL;
5585}
5586
Antoine Pitrouab868312009-01-10 15:40:25 +00005587#undef FAST_CHAR_MASK
5588#undef SWAPPED_FAST_CHAR_MASK
5589
Tim Peters772747b2001-08-09 22:21:55 +00005590PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005591_PyUnicode_EncodeUTF16(PyObject *str,
5592 const char *errors,
5593 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005595 int kind;
5596 void *data;
5597 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005598 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005599 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005600 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005601 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005602 /* Offsets from p for storing byte pairs in the right order. */
5603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5604 int ihi = 1, ilo = 0;
5605#else
5606 int ihi = 0, ilo = 1;
5607#endif
5608
Benjamin Peterson29060642009-01-31 22:14:21 +00005609#define STORECHAR(CH) \
5610 do { \
5611 p[ihi] = ((CH) >> 8) & 0xff; \
5612 p[ilo] = (CH) & 0xff; \
5613 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005614 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005616 if (!PyUnicode_Check(str)) {
5617 PyErr_BadArgument();
5618 return NULL;
5619 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005620 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 return NULL;
5622 kind = PyUnicode_KIND(str);
5623 data = PyUnicode_DATA(str);
5624 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005625
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005626 pairs = 0;
5627 if (kind == PyUnicode_4BYTE_KIND)
5628 for (i = 0; i < len; i++)
5629 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5630 pairs++;
5631 /* 2 * (len + pairs + (byteorder == 0)) */
5632 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005635 bytesize = nsize * 2;
5636 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005638 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 if (v == NULL)
5640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005642 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005645 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005646 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005647
5648 if (byteorder == -1) {
5649 /* force LE */
5650 ihi = 1;
5651 ilo = 0;
5652 }
5653 else if (byteorder == 1) {
5654 /* force BE */
5655 ihi = 0;
5656 ilo = 1;
5657 }
5658
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005659 for (i = 0; i < len; i++) {
5660 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5661 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005663 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5664 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 }
Tim Peters772747b2001-08-09 22:21:55 +00005666 STORECHAR(ch);
5667 if (ch2)
5668 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005669 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005670
5671 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005673#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674}
5675
Alexander Belopolsky40018472011-02-26 01:02:56 +00005676PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005677PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5678 Py_ssize_t size,
5679 const char *errors,
5680 int byteorder)
5681{
5682 PyObject *result;
5683 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5684 if (tmp == NULL)
5685 return NULL;
5686 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5687 Py_DECREF(tmp);
5688 return result;
5689}
5690
5691PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005692PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695}
5696
5697/* --- Unicode Escape Codec ----------------------------------------------- */
5698
/* Helper function for PyUnicode_DecodeUnicodeEscape: scans the escaped
   input and determines whether the decoded result is guaranteed to be a
   pure ASCII string.

   Returns the number of characters the decoded string will have, or -1 if
   - size is negative,
   - the input contains a non-ASCII byte,
   - a backslash is the last byte or is followed by a non-ASCII byte, or
   - an escape is found that may produce a non-ASCII character
     (octal digits, \x, \u, \U, \N).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming the end pointer: with a negative size,
       s + size would point before the start of the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5753
Fredrik Lundh06d12682001-01-24 07:59:11 +00005754static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005755
Alexander Belopolsky40018472011-02-26 01:02:56 +00005756PyObject *
5757PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005758 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005759 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005762 Py_ssize_t startinpos;
5763 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005764 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005765 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005767 char* message;
5768 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 PyObject *errorHandler = NULL;
5770 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005773
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005774 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005775
5776 /* After length_of_escaped_ascii_string() there are two alternatives,
5777 either the string is pure ASCII with named escapes like \n, etc.
5778 and we determined it's exact size (common case)
5779 or it contains \x, \u, ... escape sequences. then we create a
5780 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 if (len >= 0) {
5782 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 if (!v)
5784 goto onError;
5785 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 }
5787 else {
5788 /* Escaped strings will always be longer than the resulting
5789 Unicode string, so we start with size here and then reduce the
5790 length after conversion to the true value.
5791 (but if the error callback returns a long replacement string
5792 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005793 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 if (!v)
5795 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005796 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 }
5798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005800 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 while (s < end) {
5805 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005806 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005809 /* The only case in which i == ascii_length is a backslash
5810 followed by a newline. */
5811 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005812
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 /* Non-escape characters are interpreted as Unicode ordinals */
5814 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 continue;
5818 }
5819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 /* \ - Escapes */
5822 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005823 c = *s++;
5824 if (s > end)
5825 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 /* The only case in which i == ascii_length is a backslash
5828 followed by a newline. */
5829 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005831 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005834#define WRITECHAR(ch) \
5835 do { \
5836 if (unicode_putchar(&v, &i, ch) < 0) \
5837 goto onError; \
5838 }while(0)
5839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 case '\\': WRITECHAR('\\'); break;
5842 case '\'': WRITECHAR('\''); break;
5843 case '\"': WRITECHAR('\"'); break;
5844 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005845 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005846 case 'f': WRITECHAR('\014'); break;
5847 case 't': WRITECHAR('\t'); break;
5848 case 'n': WRITECHAR('\n'); break;
5849 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005852 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005853 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 case '0': case '1': case '2': case '3':
5857 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005858 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005859 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005860 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005861 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005862 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005864 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 break;
5866
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 /* hex escapes */
5868 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005870 digits = 2;
5871 message = "truncated \\xXX escape";
5872 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005876 digits = 4;
5877 message = "truncated \\uXXXX escape";
5878 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005881 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005882 digits = 8;
5883 message = "truncated \\UXXXXXXXX escape";
5884 hexescape:
5885 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 if (s+digits>end) {
5887 endinpos = size;
5888 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 errors, &errorHandler,
5890 "unicodeescape", "end of string in escape sequence",
5891 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 goto onError;
5894 goto nextByte;
5895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 for (j = 0; j < digits; ++j) {
5897 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005898 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005899 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 errors, &errorHandler,
5902 "unicodeescape", message,
5903 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005905 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005906 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005908 }
5909 chr = (chr<<4) & ~0xF;
5910 if (c >= '0' && c <= '9')
5911 chr += c - '0';
5912 else if (c >= 'a' && c <= 'f')
5913 chr += 10 + c - 'a';
5914 else
5915 chr += 10 + c - 'A';
5916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005917 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005918 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 /* _decoding_error will have already written into the
5920 target buffer. */
5921 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005922 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005923 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005924 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005925 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005926 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 errors, &errorHandler,
5930 "unicodeescape", "illegal Unicode character",
5931 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005932 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005933 goto onError;
5934 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005935 break;
5936
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005938 case 'N':
5939 message = "malformed \\N character escape";
5940 if (ucnhash_CAPI == NULL) {
5941 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5943 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005944 if (ucnhash_CAPI == NULL)
5945 goto ucnhashError;
5946 }
5947 if (*s == '{') {
5948 const char *start = s+1;
5949 /* look for the closing brace */
5950 while (*s != '}' && s < end)
5951 s++;
5952 if (s > start && s < end && *s == '}') {
5953 /* found a name. look it up in the unicode database */
5954 message = "unknown Unicode character name";
5955 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005956 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005957 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005958 goto store;
5959 }
5960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005962 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 errors, &errorHandler,
5964 "unicodeescape", message,
5965 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005966 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005967 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005968 break;
5969
5970 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005971 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 message = "\\ at end of string";
5973 s--;
5974 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 errors, &errorHandler,
5977 "unicodeescape", message,
5978 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005980 goto onError;
5981 }
5982 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005983 WRITECHAR('\\');
5984 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005985 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005986 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005991#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005992
Victor Stinner16e6a802011-12-12 13:24:15 +01005993 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005995 Py_XDECREF(errorHandler);
5996 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005997 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005998
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006000 PyErr_SetString(
6001 PyExc_UnicodeError,
6002 "\\N escapes not supported (can't load unicodedata module)"
6003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 Py_XDECREF(errorHandler);
6006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006007 return NULL;
6008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 Py_XDECREF(errorHandler);
6012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 return NULL;
6014}
6015
6016/* Return a Unicode-Escape string version of the Unicode object.
6017
6018 If quotes is true, the string is enclosed in u"" or u'' quotes as
6019 appropriate.
6020
6021*/
6022
Alexander Belopolsky40018472011-02-26 01:02:56 +00006023PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006027 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006029 int kind;
6030 void *data;
6031 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Thomas Wouters89f507f2006-12-13 04:49:30 +00006033 /* Initial allocation is based on the longest-possible unichr
6034 escape.
6035
6036 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6037 unichr, so in this case it's the longest unichr escape. In
6038 narrow (UTF-16) builds this is five chars per source unichr
6039 since there are two unichrs in the surrogate pair, so in narrow
6040 (UTF-16) builds it's not the longest unichr escape.
6041
6042 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6043 so in the narrow (UTF-16) build case it's the longest unichr
6044 escape.
6045 */
6046
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006047 if (!PyUnicode_Check(unicode)) {
6048 PyErr_BadArgument();
6049 return NULL;
6050 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006051 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 return NULL;
6053 len = PyUnicode_GET_LENGTH(unicode);
6054 kind = PyUnicode_KIND(unicode);
6055 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006056 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6058 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6059 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6060 }
6061
6062 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006063 return PyBytes_FromStringAndSize(NULL, 0);
6064
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006068 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006070 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 if (repr == NULL)
6073 return NULL;
6074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006075 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006078 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006079
Walter Dörwald79e913e2007-05-12 11:08:06 +00006080 /* Escape backslashes */
6081 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 *p++ = '\\';
6083 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006084 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006086
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006087 /* Map 21-bit characters to '\U00xxxxxx' */
6088 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006089 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006090 *p++ = '\\';
6091 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006092 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6093 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6094 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6095 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6096 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6097 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6098 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6099 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006101 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006104 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 *p++ = '\\';
6106 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006107 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6108 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6109 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6110 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006113 /* Map special whitespace to '\t', \n', '\r' */
6114 else if (ch == '\t') {
6115 *p++ = '\\';
6116 *p++ = 't';
6117 }
6118 else if (ch == '\n') {
6119 *p++ = '\\';
6120 *p++ = 'n';
6121 }
6122 else if (ch == '\r') {
6123 *p++ = '\\';
6124 *p++ = 'r';
6125 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006126
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006127 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006128 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006130 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006131 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6132 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006134
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 /* Copy everything else as-is */
6136 else
6137 *p++ = (char) ch;
6138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006140 assert(p - PyBytes_AS_STRING(repr) > 0);
6141 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6142 return NULL;
6143 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144}
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6148 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150 PyObject *result;
6151 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6152 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 result = PyUnicode_AsUnicodeEscapeString(tmp);
6155 Py_DECREF(tmp);
6156 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157}
6158
6159/* --- Raw Unicode Escape Codec ------------------------------------------- */
6160
Alexander Belopolsky40018472011-02-26 01:02:56 +00006161PyObject *
6162PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006163 Py_ssize_t size,
6164 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006166 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006167 Py_ssize_t startinpos;
6168 Py_ssize_t endinpos;
6169 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006170 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 const char *end;
6172 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173 PyObject *errorHandler = NULL;
6174 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 /* Escaped strings will always be longer than the resulting
6177 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 length after conversion to the true value. (But decoding error
6179 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006180 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006184 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006185 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 end = s + size;
6187 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 unsigned char c;
6189 Py_UCS4 x;
6190 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006191 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 /* Non-escape characters are interpreted as Unicode ordinals */
6194 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006195 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6196 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 startinpos = s-starts;
6200
6201 /* \u-escapes are only interpreted iff the number of leading
6202 backslashes if odd */
6203 bs = s;
6204 for (;s < end;) {
6205 if (*s != '\\')
6206 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006207 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6208 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 }
6210 if (((s - bs) & 1) == 0 ||
6211 s >= end ||
6212 (*s != 'u' && *s != 'U')) {
6213 continue;
6214 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006215 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 count = *s=='u' ? 4 : 8;
6217 s++;
6218
6219 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 for (x = 0, i = 0; i < count; ++i, ++s) {
6221 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006222 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 endinpos = s-starts;
6224 if (unicode_decode_call_errorhandler(
6225 errors, &errorHandler,
6226 "rawunicodeescape", "truncated \\uXXXX",
6227 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006228 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 goto onError;
6230 goto nextByte;
6231 }
6232 x = (x<<4) & ~0xF;
6233 if (c >= '0' && c <= '9')
6234 x += c - '0';
6235 else if (c >= 'a' && c <= 'f')
6236 x += 10 + c - 'a';
6237 else
6238 x += 10 + c - 'A';
6239 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006240 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006241 if (unicode_putchar(&v, &outpos, x) < 0)
6242 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006243 } else {
6244 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006245 if (unicode_decode_call_errorhandler(
6246 errors, &errorHandler,
6247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006249 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006251 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 nextByte:
6253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006255 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006257 Py_XDECREF(errorHandler);
6258 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006259 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006260
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 Py_XDECREF(errorHandler);
6264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 return NULL;
6266}
6267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268
Alexander Belopolsky40018472011-02-26 01:02:56 +00006269PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006270PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006272 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 char *p;
6274 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006275 Py_ssize_t expandsize, pos;
6276 int kind;
6277 void *data;
6278 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006280 if (!PyUnicode_Check(unicode)) {
6281 PyErr_BadArgument();
6282 return NULL;
6283 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006284 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006285 return NULL;
6286 kind = PyUnicode_KIND(unicode);
6287 data = PyUnicode_DATA(unicode);
6288 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006289 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6290 bytes, and 1 byte characters 4. */
6291 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006292
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006293 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006295
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006296 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 if (repr == NULL)
6298 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006299 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006302 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006303 for (pos = 0; pos < len; pos++) {
6304 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 /* Map 32-bit characters to '\Uxxxxxxxx' */
6306 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006307 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006308 *p++ = '\\';
6309 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006310 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6311 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6312 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6313 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6314 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6315 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6316 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6317 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006320 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 *p++ = '\\';
6322 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006323 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6324 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6325 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6326 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 /* Copy everything else as-is */
6329 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 *p++ = (char) ch;
6331 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006332
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006333 assert(p > q);
6334 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006335 return NULL;
6336 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337}
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6341 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006343 PyObject *result;
6344 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6345 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006346 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006347 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6348 Py_DECREF(tmp);
6349 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350}
6351
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006352/* --- Unicode Internal Codec ------------------------------------------- */
6353
Alexander Belopolsky40018472011-02-26 01:02:56 +00006354PyObject *
6355_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006356 Py_ssize_t size,
6357 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006358{
6359 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 Py_ssize_t startinpos;
6361 Py_ssize_t endinpos;
6362 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006363 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006364 const char *end;
6365 const char *reason;
6366 PyObject *errorHandler = NULL;
6367 PyObject *exc = NULL;
6368
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006369 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006370 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006371 1))
6372 return NULL;
6373
Thomas Wouters89f507f2006-12-13 04:49:30 +00006374 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006375 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006376 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006378 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006379 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006380 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006381 end = s + size;
6382
6383 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006384 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006385 Py_UCS4 ch;
6386 /* We copy the raw representation one byte at a time because the
6387 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006388 ((char *) &uch)[0] = s[0];
6389 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006390#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006391 ((char *) &uch)[2] = s[2];
6392 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006393#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006394 ch = uch;
6395
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006396 /* We have to sanity check the raw data, otherwise doom looms for
6397 some malformed UCS-4 data. */
6398 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006399#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006400 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006402 end-s < Py_UNICODE_SIZE
6403 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006405 startinpos = s - starts;
6406 if (end-s < Py_UNICODE_SIZE) {
6407 endinpos = end-starts;
6408 reason = "truncated input";
6409 }
6410 else {
6411 endinpos = s - starts + Py_UNICODE_SIZE;
6412 reason = "illegal code point (> 0x10FFFF)";
6413 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006414 if (unicode_decode_call_errorhandler(
6415 errors, &errorHandler,
6416 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006417 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006419 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006420 continue;
6421 }
6422
6423 s += Py_UNICODE_SIZE;
6424#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006425 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006426 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006427 Py_UNICODE uch2;
6428 ((char *) &uch2)[0] = s[0];
6429 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006430 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006431 {
Victor Stinner551ac952011-11-29 22:58:13 +01006432 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006433 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006434 }
6435 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006436#endif
6437
6438 if (unicode_putchar(&v, &outpos, ch) < 0)
6439 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006440 }
6441
Victor Stinner16e6a802011-12-12 13:24:15 +01006442 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006443 goto onError;
6444 Py_XDECREF(errorHandler);
6445 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006446 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006449 Py_XDECREF(v);
6450 Py_XDECREF(errorHandler);
6451 Py_XDECREF(exc);
6452 return NULL;
6453}
6454
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455/* --- Latin-1 Codec ------------------------------------------------------ */
6456
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
6458PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006459 Py_ssize_t size,
6460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006463 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464}
6465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006467static void
6468make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006469 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006470 PyObject *unicode,
6471 Py_ssize_t startpos, Py_ssize_t endpos,
6472 const char *reason)
6473{
6474 if (*exceptionObject == NULL) {
6475 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006476 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006477 encoding, unicode, startpos, endpos, reason);
6478 }
6479 else {
6480 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6481 goto onError;
6482 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6483 goto onError;
6484 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6485 goto onError;
6486 return;
6487 onError:
6488 Py_DECREF(*exceptionObject);
6489 *exceptionObject = NULL;
6490 }
6491}
6492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006494static void
6495raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006496 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006497 PyObject *unicode,
6498 Py_ssize_t startpos, Py_ssize_t endpos,
6499 const char *reason)
6500{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006501 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006502 encoding, unicode, startpos, endpos, reason);
6503 if (*exceptionObject != NULL)
6504 PyCodec_StrictErrors(*exceptionObject);
6505}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506
6507/* error handling callback helper:
6508 build arguments, call the callback and check the arguments,
6509 put the result into newpos and return the replacement string, which
6510 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006511static PyObject *
6512unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006513 PyObject **errorHandler,
6514 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006516 Py_ssize_t startpos, Py_ssize_t endpos,
6517 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006519 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006521 PyObject *restuple;
6522 PyObject *resunicode;
6523
6524 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528 }
6529
Benjamin Petersonbac79492012-01-14 13:34:47 -05006530 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 return NULL;
6532 len = PyUnicode_GET_LENGTH(unicode);
6533
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006534 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006538
6539 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006544 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 Py_DECREF(restuple);
6546 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006548 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 &resunicode, newpos)) {
6550 Py_DECREF(restuple);
6551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006553 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6554 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6555 Py_DECREF(restuple);
6556 return NULL;
6557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 *newpos = len + *newpos;
6560 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6562 Py_DECREF(restuple);
6563 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006564 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 Py_INCREF(resunicode);
6566 Py_DECREF(restuple);
6567 return resunicode;
6568}
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006571unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006572 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006573 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 /* input state */
6576 Py_ssize_t pos=0, size;
6577 int kind;
6578 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 /* output object */
6580 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006581 /* pointer into the output */
6582 char *str;
6583 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006584 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006585 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6586 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006587 PyObject *errorHandler = NULL;
6588 PyObject *exc = NULL;
6589 /* the following variable is used for caching string comparisons
6590 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6591 int known_errorHandler = -1;
6592
Benjamin Petersonbac79492012-01-14 13:34:47 -05006593 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 return NULL;
6595 size = PyUnicode_GET_LENGTH(unicode);
6596 kind = PyUnicode_KIND(unicode);
6597 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598 /* allocate enough for a simple encoding without
6599 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006600 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006601 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006602 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006604 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006605 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 ressize = size;
6607
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006608 while (pos < size) {
6609 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 /* can we encode this? */
6612 if (c<limit) {
6613 /* no overflow check, because we know that the space is enough */
6614 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 Py_ssize_t requiredsize;
6619 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622 Py_ssize_t collstart = pos;
6623 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006625 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 ++collend;
6627 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6628 if (known_errorHandler==-1) {
6629 if ((errors==NULL) || (!strcmp(errors, "strict")))
6630 known_errorHandler = 1;
6631 else if (!strcmp(errors, "replace"))
6632 known_errorHandler = 2;
6633 else if (!strcmp(errors, "ignore"))
6634 known_errorHandler = 3;
6635 else if (!strcmp(errors, "xmlcharrefreplace"))
6636 known_errorHandler = 4;
6637 else
6638 known_errorHandler = 0;
6639 }
6640 switch (known_errorHandler) {
6641 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006642 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 goto onError;
6644 case 2: /* replace */
6645 while (collstart++<collend)
6646 *str++ = '?'; /* fall through */
6647 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 break;
6650 case 4: /* xmlcharrefreplace */
6651 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006652 /* determine replacement size */
6653 for (i = collstart, repsize = 0; i < collend; ++i) {
6654 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6655 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006665 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006667 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006668 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006672 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 if (requiredsize > ressize) {
6674 if (requiredsize<2*ressize)
6675 requiredsize = 2*ressize;
6676 if (_PyBytes_Resize(&res, requiredsize))
6677 goto onError;
6678 str = PyBytes_AS_STRING(res) + respos;
6679 ressize = requiredsize;
6680 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681 /* generate replacement */
6682 for (i = collstart; i < collend; ++i) {
6683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 break;
6687 default:
6688 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 encoding, reason, unicode, &exc,
6690 collstart, collend, &newpos);
6691 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006692 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006694 if (PyBytes_Check(repunicode)) {
6695 /* Directly copy bytes result to output. */
6696 repsize = PyBytes_Size(repunicode);
6697 if (repsize > 1) {
6698 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006699 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006700 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6701 Py_DECREF(repunicode);
6702 goto onError;
6703 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006704 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006705 ressize += repsize-1;
6706 }
6707 memcpy(str, PyBytes_AsString(repunicode), repsize);
6708 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006710 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006711 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 /* need more space? (at least enough for what we
6714 have+the replacement+the rest of the string, so
6715 we won't have to check space for encodable characters) */
6716 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 repsize = PyUnicode_GET_LENGTH(repunicode);
6718 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 if (requiredsize > ressize) {
6720 if (requiredsize<2*ressize)
6721 requiredsize = 2*ressize;
6722 if (_PyBytes_Resize(&res, requiredsize)) {
6723 Py_DECREF(repunicode);
6724 goto onError;
6725 }
6726 str = PyBytes_AS_STRING(res) + respos;
6727 ressize = requiredsize;
6728 }
6729 /* check if there is anything unencodable in the replacement
6730 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 for (i = 0; repsize-->0; ++i, ++str) {
6732 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006734 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 Py_DECREF(repunicode);
6737 goto onError;
6738 }
6739 *str = (char)c;
6740 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006744 }
6745 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006746 /* Resize if we allocated to much */
6747 size = str - PyBytes_AS_STRING(res);
6748 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006749 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006750 if (_PyBytes_Resize(&res, size) < 0)
6751 goto onError;
6752 }
6753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 Py_XDECREF(errorHandler);
6755 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006756 return res;
6757
6758 onError:
6759 Py_XDECREF(res);
6760 Py_XDECREF(errorHandler);
6761 Py_XDECREF(exc);
6762 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763}
6764
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006766PyObject *
6767PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006768 Py_ssize_t size,
6769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 PyObject *result;
6772 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6773 if (unicode == NULL)
6774 return NULL;
6775 result = unicode_encode_ucs1(unicode, errors, 256);
6776 Py_DECREF(unicode);
6777 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778}
6779
Alexander Belopolsky40018472011-02-26 01:02:56 +00006780PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006781_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
6783 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 PyErr_BadArgument();
6785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006787 if (PyUnicode_READY(unicode) == -1)
6788 return NULL;
6789 /* Fast path: if it is a one-byte string, construct
6790 bytes object directly. */
6791 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6792 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6793 PyUnicode_GET_LENGTH(unicode));
6794 /* Non-Latin-1 characters present. Defer to above function to
6795 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006797}
6798
6799PyObject*
6800PyUnicode_AsLatin1String(PyObject *unicode)
6801{
6802 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
6805/* --- 7-bit ASCII Codec -------------------------------------------------- */
6806
Alexander Belopolsky40018472011-02-26 01:02:56 +00006807PyObject *
6808PyUnicode_DecodeASCII(const char *s,
6809 Py_ssize_t size,
6810 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006813 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006814 int kind;
6815 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006816 Py_ssize_t startinpos;
6817 Py_ssize_t endinpos;
6818 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006819 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006820 int has_error;
6821 const unsigned char *p = (const unsigned char *)s;
6822 const unsigned char *end = p + size;
6823 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824 PyObject *errorHandler = NULL;
6825 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006827 if (size == 0) {
6828 Py_INCREF(unicode_empty);
6829 return unicode_empty;
6830 }
6831
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006833 if (size == 1 && (unsigned char)s[0] < 128)
6834 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006835
Victor Stinner702c7342011-10-05 13:50:52 +02006836 has_error = 0;
6837 while (p < end && !has_error) {
6838 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6839 an explanation. */
6840 if (!((size_t) p & LONG_PTR_MASK)) {
6841 /* Help register allocation */
6842 register const unsigned char *_p = p;
6843 while (_p < aligned_end) {
6844 unsigned long value = *(unsigned long *) _p;
6845 if (value & ASCII_CHAR_MASK) {
6846 has_error = 1;
6847 break;
6848 }
6849 _p += SIZEOF_LONG;
6850 }
6851 if (_p == end)
6852 break;
6853 if (has_error)
6854 break;
6855 p = _p;
6856 }
6857 if (*p & 0x80) {
6858 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006859 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006860 }
6861 else {
6862 ++p;
6863 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006864 }
Victor Stinner702c7342011-10-05 13:50:52 +02006865 if (!has_error)
6866 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006868 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006872 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006873 kind = PyUnicode_KIND(v);
6874 data = PyUnicode_DATA(v);
6875 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 e = s + size;
6877 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 register unsigned char c = (unsigned char)*s;
6879 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006880 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 ++s;
6882 }
6883 else {
6884 startinpos = s-starts;
6885 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 if (unicode_decode_call_errorhandler(
6887 errors, &errorHandler,
6888 "ascii", "ordinal not in range(128)",
6889 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006890 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006892 kind = PyUnicode_KIND(v);
6893 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006896 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006897 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 Py_XDECREF(errorHandler);
6899 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006900 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006901 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006902
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 Py_XDECREF(errorHandler);
6906 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 return NULL;
6908}
6909
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006910/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_EncodeASCII(const Py_UNICODE *p,
6913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006916 PyObject *result;
6917 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6918 if (unicode == NULL)
6919 return NULL;
6920 result = unicode_encode_ucs1(unicode, errors, 128);
6921 Py_DECREF(unicode);
6922 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923}
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006926_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 PyErr_BadArgument();
6930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006932 if (PyUnicode_READY(unicode) == -1)
6933 return NULL;
6934 /* Fast path: if it is an ASCII-only string, construct bytes object
6935 directly. Else defer to above function to raise the exception. */
6936 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6937 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6938 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006939 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006940}
6941
6942PyObject *
6943PyUnicode_AsASCIIString(PyObject *unicode)
6944{
6945 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946}
6947
Victor Stinner99b95382011-07-04 14:23:54 +02006948#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006949
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006950/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006951
/* NEED_RETRY is defined only when int is narrower than size_t. */
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006952#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953#define NEED_RETRY
6954#endif
6955
/* Fallback value for WC_ERR_INVALID_CHARS, used only when the included
   headers have not already defined it. */
Victor Stinner3a50e702011-10-18 21:21:00 +02006956#ifndef WC_ERR_INVALID_CHARS
6957# define WC_ERR_INVALID_CHARS 0x0080
6958#endif
6959
6960static char*
6961code_page_name(UINT code_page, PyObject **obj)
6962{
6963 *obj = NULL;
6964 if (code_page == CP_ACP)
6965 return "mbcs";
6966 if (code_page == CP_UTF7)
6967 return "CP_UTF7";
6968 if (code_page == CP_UTF8)
6969 return "CP_UTF8";
6970
6971 *obj = PyBytes_FromFormat("cp%u", code_page);
6972 if (*obj == NULL)
6973 return NULL;
6974 return PyBytes_AS_STRING(*obj);
6975}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976
Alexander Belopolsky40018472011-02-26 01:02:56 +00006977static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006978is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979{
6980 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 if (!IsDBCSLeadByteEx(code_page, *curr))
6984 return 0;
6985
6986 prev = CharPrevExA(code_page, s, curr, 0);
6987 if (prev == curr)
6988 return 1;
6989 /* FIXME: This code is limited to "true" double-byte encodings,
6990 as it assumes an incomplete character consists of a single
6991 byte. */
6992 if (curr - prev == 2)
6993 return 1;
6994 if (!IsDBCSLeadByteEx(code_page, *prev))
6995 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996 return 0;
6997}
6998
Victor Stinner3a50e702011-10-18 21:21:00 +02006999static DWORD
7000decode_code_page_flags(UINT code_page)
7001{
7002 if (code_page == CP_UTF7) {
7003 /* The CP_UTF7 decoder only supports flags=0 */
7004 return 0;
7005 }
7006 else
7007 return MB_ERR_INVALID_CHARS;
7008}
7009
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 * Decode a byte string from a Windows code page into unicode object in strict
7012 * mode.
7013 *
7014 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7015 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007016 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007017static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007018decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007019 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 const char *in,
7021 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022{
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007024 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026
7027 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 assert(insize > 0);
7029 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7030 if (outsize <= 0)
7031 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
7033 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007035 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007036 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 if (*v == NULL)
7038 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040 }
7041 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007044 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007046 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047 }
7048
7049 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7051 if (outsize <= 0)
7052 goto error;
7053 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007054
Victor Stinner3a50e702011-10-18 21:21:00 +02007055error:
7056 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7057 return -2;
7058 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007059 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060}
7061
Victor Stinner3a50e702011-10-18 21:21:00 +02007062/*
7063 * Decode a byte string from a code page into unicode object with an error
7064 * handler.
7065 *
7066 * Returns consumed size if succeed, or raise a WindowsError or
7067 * UnicodeDecodeError exception and returns -1 on error.
7068 */
7069static int
7070decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007071 PyObject **v,
7072 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 const char *errors)
7074{
7075 const char *startin = in;
7076 const char *endin = in + size;
7077 const DWORD flags = decode_code_page_flags(code_page);
7078 /* Ideally, we should get reason from FormatMessage. This is the Windows
7079 2000 English version of the message. */
7080 const char *reason = "No mapping for the Unicode character exists "
7081 "in the target code page.";
7082 /* each step cannot decode more than 1 character, but a character can be
7083 represented as a surrogate pair */
7084 wchar_t buffer[2], *startout, *out;
7085 int insize, outsize;
7086 PyObject *errorHandler = NULL;
7087 PyObject *exc = NULL;
7088 PyObject *encoding_obj = NULL;
7089 char *encoding;
7090 DWORD err;
7091 int ret = -1;
7092
7093 assert(size > 0);
7094
7095 encoding = code_page_name(code_page, &encoding_obj);
7096 if (encoding == NULL)
7097 return -1;
7098
7099 if (errors == NULL || strcmp(errors, "strict") == 0) {
7100 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7101 UnicodeDecodeError. */
7102 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7103 if (exc != NULL) {
7104 PyCodec_StrictErrors(exc);
7105 Py_CLEAR(exc);
7106 }
7107 goto error;
7108 }
7109
7110 if (*v == NULL) {
7111 /* Create unicode object */
7112 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7113 PyErr_NoMemory();
7114 goto error;
7115 }
Victor Stinnerab595942011-12-17 04:59:06 +01007116 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007117 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 if (*v == NULL)
7119 goto error;
7120 startout = PyUnicode_AS_UNICODE(*v);
7121 }
7122 else {
7123 /* Extend unicode object */
7124 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7125 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7126 PyErr_NoMemory();
7127 goto error;
7128 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007129 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 goto error;
7131 startout = PyUnicode_AS_UNICODE(*v) + n;
7132 }
7133
7134 /* Decode the byte string character per character */
7135 out = startout;
7136 while (in < endin)
7137 {
7138 /* Decode a character */
7139 insize = 1;
7140 do
7141 {
7142 outsize = MultiByteToWideChar(code_page, flags,
7143 in, insize,
7144 buffer, Py_ARRAY_LENGTH(buffer));
7145 if (outsize > 0)
7146 break;
7147 err = GetLastError();
7148 if (err != ERROR_NO_UNICODE_TRANSLATION
7149 && err != ERROR_INSUFFICIENT_BUFFER)
7150 {
7151 PyErr_SetFromWindowsErr(0);
7152 goto error;
7153 }
7154 insize++;
7155 }
7156 /* 4=maximum length of a UTF-8 sequence */
7157 while (insize <= 4 && (in + insize) <= endin);
7158
7159 if (outsize <= 0) {
7160 Py_ssize_t startinpos, endinpos, outpos;
7161
7162 startinpos = in - startin;
7163 endinpos = startinpos + 1;
7164 outpos = out - PyUnicode_AS_UNICODE(*v);
7165 if (unicode_decode_call_errorhandler(
7166 errors, &errorHandler,
7167 encoding, reason,
7168 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007169 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 {
7171 goto error;
7172 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007173 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 }
7175 else {
7176 in += insize;
7177 memcpy(out, buffer, outsize * sizeof(wchar_t));
7178 out += outsize;
7179 }
7180 }
7181
7182 /* write a NUL character at the end */
7183 *out = 0;
7184
7185 /* Extend unicode object */
7186 outsize = out - startout;
7187 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007188 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007191
7192error:
7193 Py_XDECREF(encoding_obj);
7194 Py_XDECREF(errorHandler);
7195 Py_XDECREF(exc);
7196 return ret;
7197}
7198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199static PyObject *
7200decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 const char *s, Py_ssize_t size,
7202 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203{
Victor Stinner76a31a62011-11-04 00:05:13 +01007204 PyObject *v = NULL;
7205 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 if (code_page < 0) {
7208 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7209 return NULL;
7210 }
7211
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 do
7216 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007218 if (size > INT_MAX) {
7219 chunk_size = INT_MAX;
7220 final = 0;
7221 done = 0;
7222 }
7223 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007224#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 {
7226 chunk_size = (int)size;
7227 final = (consumed == NULL);
7228 done = 1;
7229 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230
Victor Stinner76a31a62011-11-04 00:05:13 +01007231 /* Skip trailing lead-byte unless 'final' is set */
7232 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7233 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234
Victor Stinner76a31a62011-11-04 00:05:13 +01007235 if (chunk_size == 0 && done) {
7236 if (v != NULL)
7237 break;
7238 Py_INCREF(unicode_empty);
7239 return unicode_empty;
7240 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007241
Victor Stinner76a31a62011-11-04 00:05:13 +01007242
7243 converted = decode_code_page_strict(code_page, &v,
7244 s, chunk_size);
7245 if (converted == -2)
7246 converted = decode_code_page_errors(code_page, &v,
7247 s, chunk_size,
7248 errors);
7249 assert(converted != 0);
7250
7251 if (converted < 0) {
7252 Py_XDECREF(v);
7253 return NULL;
7254 }
7255
7256 if (consumed)
7257 *consumed += converted;
7258
7259 s += converted;
7260 size -= converted;
7261 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007262
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007263 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007264}
7265
Alexander Belopolsky40018472011-02-26 01:02:56 +00007266PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007267PyUnicode_DecodeCodePageStateful(int code_page,
7268 const char *s,
7269 Py_ssize_t size,
7270 const char *errors,
7271 Py_ssize_t *consumed)
7272{
7273 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7274}
7275
7276PyObject *
7277PyUnicode_DecodeMBCSStateful(const char *s,
7278 Py_ssize_t size,
7279 const char *errors,
7280 Py_ssize_t *consumed)
7281{
7282 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7283}
7284
7285PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007286PyUnicode_DecodeMBCS(const char *s,
7287 Py_ssize_t size,
7288 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007289{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7291}
7292
Victor Stinner3a50e702011-10-18 21:21:00 +02007293static DWORD
7294encode_code_page_flags(UINT code_page, const char *errors)
7295{
7296 if (code_page == CP_UTF8) {
7297 if (winver.dwMajorVersion >= 6)
7298 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7299 and later */
7300 return WC_ERR_INVALID_CHARS;
7301 else
7302 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7303 return 0;
7304 }
7305 else if (code_page == CP_UTF7) {
7306 /* CP_UTF7 only supports flags=0 */
7307 return 0;
7308 }
7309 else {
7310 if (errors != NULL && strcmp(errors, "replace") == 0)
7311 return 0;
7312 else
7313 return WC_NO_BEST_FIT_CHARS;
7314 }
7315}
7316
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 * Encode a Unicode string to a Windows code page into a byte string in strict
7319 * mode.
7320 *
7321 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7322 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007325encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328{
Victor Stinner554f3f02010-06-16 23:33:54 +00007329 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 BOOL *pusedDefaultChar = &usedDefaultChar;
7331 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007332 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007333 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007334 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 const DWORD flags = encode_code_page_flags(code_page, NULL);
7336 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007337 /* Create a substring so that we can get the UTF-16 representation
7338 of just the slice under consideration. */
7339 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340
Martin v. Löwis3d325192011-11-04 18:23:06 +01007341 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007342
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007344 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007346 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007347
Victor Stinner2fc507f2011-11-04 20:06:39 +01007348 substring = PyUnicode_Substring(unicode, offset, offset+len);
7349 if (substring == NULL)
7350 return -1;
7351 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7352 if (p == NULL) {
7353 Py_DECREF(substring);
7354 return -1;
7355 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007357 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 outsize = WideCharToMultiByte(code_page, flags,
7359 p, size,
7360 NULL, 0,
7361 NULL, pusedDefaultChar);
7362 if (outsize <= 0)
7363 goto error;
7364 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365 if (pusedDefaultChar && *pusedDefaultChar) {
7366 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007368 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007369
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007373 if (*outbytes == NULL) {
7374 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007376 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378 }
7379 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 const Py_ssize_t n = PyBytes_Size(*outbytes);
7382 if (outsize > PY_SSIZE_T_MAX - n) {
7383 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007384 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007387 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7388 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007390 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392 }
7393
7394 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 outsize = WideCharToMultiByte(code_page, flags,
7396 p, size,
7397 out, outsize,
7398 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007399 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 if (outsize <= 0)
7401 goto error;
7402 if (pusedDefaultChar && *pusedDefaultChar)
7403 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007404 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007405
Victor Stinner3a50e702011-10-18 21:21:00 +02007406error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007407 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7409 return -2;
7410 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007411 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007412}
7413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414/*
7415 * Encode a Unicode string to a Windows code page into a byte string using a
7416 * error handler.
7417 *
7418 * Returns consumed characters if succeed, or raise a WindowsError and returns
7419 * -1 on other error.
7420 */
7421static int
7422encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007423 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007424 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007425{
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 Py_ssize_t pos = unicode_offset;
7428 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 /* Ideally, we should get reason from FormatMessage. This is the Windows
7430 2000 English version of the message. */
7431 const char *reason = "invalid character";
7432 /* 4=maximum length of a UTF-8 sequence */
7433 char buffer[4];
7434 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7435 Py_ssize_t outsize;
7436 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 PyObject *errorHandler = NULL;
7438 PyObject *exc = NULL;
7439 PyObject *encoding_obj = NULL;
7440 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007441 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 PyObject *rep;
7443 int ret = -1;
7444
7445 assert(insize > 0);
7446
7447 encoding = code_page_name(code_page, &encoding_obj);
7448 if (encoding == NULL)
7449 return -1;
7450
7451 if (errors == NULL || strcmp(errors, "strict") == 0) {
7452 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7453 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007454 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 if (exc != NULL) {
7456 PyCodec_StrictErrors(exc);
7457 Py_DECREF(exc);
7458 }
7459 Py_XDECREF(encoding_obj);
7460 return -1;
7461 }
7462
7463 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7464 pusedDefaultChar = &usedDefaultChar;
7465 else
7466 pusedDefaultChar = NULL;
7467
7468 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7469 PyErr_NoMemory();
7470 goto error;
7471 }
7472 outsize = insize * Py_ARRAY_LENGTH(buffer);
7473
7474 if (*outbytes == NULL) {
7475 /* Create string object */
7476 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7477 if (*outbytes == NULL)
7478 goto error;
7479 out = PyBytes_AS_STRING(*outbytes);
7480 }
7481 else {
7482 /* Extend string object */
7483 Py_ssize_t n = PyBytes_Size(*outbytes);
7484 if (n > PY_SSIZE_T_MAX - outsize) {
7485 PyErr_NoMemory();
7486 goto error;
7487 }
7488 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7489 goto error;
7490 out = PyBytes_AS_STRING(*outbytes) + n;
7491 }
7492
7493 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007494 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7497 wchar_t chars[2];
7498 int charsize;
7499 if (ch < 0x10000) {
7500 chars[0] = (wchar_t)ch;
7501 charsize = 1;
7502 }
7503 else {
7504 ch -= 0x10000;
7505 chars[0] = 0xd800 + (ch >> 10);
7506 chars[1] = 0xdc00 + (ch & 0x3ff);
7507 charsize = 2;
7508 }
7509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007511 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 buffer, Py_ARRAY_LENGTH(buffer),
7513 NULL, pusedDefaultChar);
7514 if (outsize > 0) {
7515 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7516 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 memcpy(out, buffer, outsize);
7519 out += outsize;
7520 continue;
7521 }
7522 }
7523 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7524 PyErr_SetFromWindowsErr(0);
7525 goto error;
7526 }
7527
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 rep = unicode_encode_call_errorhandler(
7529 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007530 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 if (rep == NULL)
7533 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007534 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007535
7536 if (PyBytes_Check(rep)) {
7537 outsize = PyBytes_GET_SIZE(rep);
7538 if (outsize != 1) {
7539 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7540 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7541 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7542 Py_DECREF(rep);
7543 goto error;
7544 }
7545 out = PyBytes_AS_STRING(*outbytes) + offset;
7546 }
7547 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7548 out += outsize;
7549 }
7550 else {
7551 Py_ssize_t i;
7552 enum PyUnicode_Kind kind;
7553 void *data;
7554
Benjamin Petersonbac79492012-01-14 13:34:47 -05007555 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 Py_DECREF(rep);
7557 goto error;
7558 }
7559
7560 outsize = PyUnicode_GET_LENGTH(rep);
7561 if (outsize != 1) {
7562 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7563 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7564 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7565 Py_DECREF(rep);
7566 goto error;
7567 }
7568 out = PyBytes_AS_STRING(*outbytes) + offset;
7569 }
7570 kind = PyUnicode_KIND(rep);
7571 data = PyUnicode_DATA(rep);
7572 for (i=0; i < outsize; i++) {
7573 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7574 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007575 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 encoding, unicode,
7577 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 "unable to encode error handler result to ASCII");
7579 Py_DECREF(rep);
7580 goto error;
7581 }
7582 *out = (unsigned char)ch;
7583 out++;
7584 }
7585 }
7586 Py_DECREF(rep);
7587 }
7588 /* write a NUL byte */
7589 *out = 0;
7590 outsize = out - PyBytes_AS_STRING(*outbytes);
7591 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7592 if (_PyBytes_Resize(outbytes, outsize) < 0)
7593 goto error;
7594 ret = 0;
7595
7596error:
7597 Py_XDECREF(encoding_obj);
7598 Py_XDECREF(errorHandler);
7599 Py_XDECREF(exc);
7600 return ret;
7601}
7602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603static PyObject *
7604encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007605 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 const char *errors)
7607{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007608 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007610 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007611 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007612
Benjamin Petersonbac79492012-01-14 13:34:47 -05007613 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007614 return NULL;
7615 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 if (code_page < 0) {
7618 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7619 return NULL;
7620 }
7621
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007623 return PyBytes_FromStringAndSize(NULL, 0);
7624
Victor Stinner7581cef2011-11-03 22:32:33 +01007625 offset = 0;
7626 do
7627 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007629 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007630 chunks. */
7631 if (len > INT_MAX/2) {
7632 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007633 done = 0;
7634 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007635 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007637 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007638 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007639 done = 1;
7640 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007643 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007644 errors);
7645 if (ret == -2)
7646 ret = encode_code_page_errors(code_page, &outbytes,
7647 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007649 if (ret < 0) {
7650 Py_XDECREF(outbytes);
7651 return NULL;
7652 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007653
Victor Stinner7581cef2011-11-03 22:32:33 +01007654 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007655 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007656 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 return outbytes;
7659}
7660
7661PyObject *
7662PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7663 Py_ssize_t size,
7664 const char *errors)
7665{
Victor Stinner7581cef2011-11-03 22:32:33 +01007666 PyObject *unicode, *res;
7667 unicode = PyUnicode_FromUnicode(p, size);
7668 if (unicode == NULL)
7669 return NULL;
7670 res = encode_code_page(CP_ACP, unicode, errors);
7671 Py_DECREF(unicode);
7672 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007673}
7674
7675PyObject *
7676PyUnicode_EncodeCodePage(int code_page,
7677 PyObject *unicode,
7678 const char *errors)
7679{
Victor Stinner7581cef2011-11-03 22:32:33 +01007680 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007681}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007682
Alexander Belopolsky40018472011-02-26 01:02:56 +00007683PyObject *
7684PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007685{
7686 if (!PyUnicode_Check(unicode)) {
7687 PyErr_BadArgument();
7688 return NULL;
7689 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007690 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007691}
7692
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007693#undef NEED_RETRY
7694
Victor Stinner99b95382011-07-04 14:23:54 +02007695#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007696
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697/* --- Character Mapping Codec -------------------------------------------- */
7698
Alexander Belopolsky40018472011-02-26 01:02:56 +00007699PyObject *
7700PyUnicode_DecodeCharmap(const char *s,
7701 Py_ssize_t size,
7702 PyObject *mapping,
7703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007706 Py_ssize_t startinpos;
7707 Py_ssize_t endinpos;
7708 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007710 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007711 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007712 PyObject *errorHandler = NULL;
7713 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007714
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 /* Default to Latin-1 */
7716 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007719 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007723 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007724 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007726 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007727 Py_ssize_t maplen;
7728 enum PyUnicode_Kind kind;
7729 void *data;
7730 Py_UCS4 x;
7731
Benjamin Petersonbac79492012-01-14 13:34:47 -05007732 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007733 return NULL;
7734
7735 maplen = PyUnicode_GET_LENGTH(mapping);
7736 data = PyUnicode_DATA(mapping);
7737 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 while (s < e) {
7739 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007742 x = PyUnicode_READ(kind, data, ch);
7743 else
7744 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007746 if (x == 0xfffe)
7747 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 startinpos = s-starts;
7750 endinpos = startinpos+1;
7751 if (unicode_decode_call_errorhandler(
7752 errors, &errorHandler,
7753 "charmap", "character maps to <undefined>",
7754 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007755 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 goto onError;
7757 }
7758 continue;
7759 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007760
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007761 if (unicode_putchar(&v, &outpos, x) < 0)
7762 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007765 }
7766 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 while (s < e) {
7768 unsigned char ch = *s;
7769 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007770
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7772 w = PyLong_FromLong((long)ch);
7773 if (w == NULL)
7774 goto onError;
7775 x = PyObject_GetItem(mapping, w);
7776 Py_DECREF(w);
7777 if (x == NULL) {
7778 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7779 /* No mapping found means: mapping is undefined. */
7780 PyErr_Clear();
7781 x = Py_None;
7782 Py_INCREF(x);
7783 } else
7784 goto onError;
7785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 /* Apply mapping */
7788 if (PyLong_Check(x)) {
7789 long value = PyLong_AS_LONG(x);
7790 if (value < 0 || value > 65535) {
7791 PyErr_SetString(PyExc_TypeError,
7792 "character mapping must be in range(65536)");
7793 Py_DECREF(x);
7794 goto onError;
7795 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007796 if (unicode_putchar(&v, &outpos, value) < 0)
7797 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 }
7799 else if (x == Py_None) {
7800 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 startinpos = s-starts;
7802 endinpos = startinpos+1;
7803 if (unicode_decode_call_errorhandler(
7804 errors, &errorHandler,
7805 "charmap", "character maps to <undefined>",
7806 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007807 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 Py_DECREF(x);
7809 goto onError;
7810 }
7811 Py_DECREF(x);
7812 continue;
7813 }
7814 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007815 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007816
Benjamin Petersonbac79492012-01-14 13:34:47 -05007817 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007818 goto onError;
7819 targetsize = PyUnicode_GET_LENGTH(x);
7820
7821 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007823 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007824 PyUnicode_READ_CHAR(x, 0)) < 0)
7825 goto onError;
7826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 else if (targetsize > 1) {
7828 /* 1-n mapping */
7829 if (targetsize > extrachars) {
7830 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 Py_ssize_t needed = (targetsize - extrachars) + \
7832 (targetsize << 2);
7833 extrachars += needed;
7834 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007835 if (unicode_resize(&v,
7836 PyUnicode_GET_LENGTH(v) + needed) < 0)
7837 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 Py_DECREF(x);
7839 goto onError;
7840 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007842 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7843 goto onError;
7844 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7845 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 extrachars -= targetsize;
7847 }
7848 /* 1-0 mapping: skip the character */
7849 }
7850 else {
7851 /* wrong return value */
7852 PyErr_SetString(PyExc_TypeError,
7853 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 Py_DECREF(x);
7855 goto onError;
7856 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 Py_DECREF(x);
7858 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007861 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007862 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 Py_XDECREF(errorHandler);
7864 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007865 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007866
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 Py_XDECREF(errorHandler);
7869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 Py_XDECREF(v);
7871 return NULL;
7872}
7873
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874/* Charmap encoding: the lookup table */
7875
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 PyObject_HEAD
7878 unsigned char level1[32];
7879 int count2, count3;
7880 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881};
7882
7883static PyObject*
7884encoding_map_size(PyObject *obj, PyObject* args)
7885{
7886 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007887 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889}
7890
7891static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007892 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 PyDoc_STR("Return the size (in bytes) of this object") },
7894 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895};
7896
7897static void
7898encoding_map_dealloc(PyObject* o)
7899{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007900 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901}
7902
7903static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007904 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 "EncodingMap", /*tp_name*/
7906 sizeof(struct encoding_map), /*tp_basicsize*/
7907 0, /*tp_itemsize*/
7908 /* methods */
7909 encoding_map_dealloc, /*tp_dealloc*/
7910 0, /*tp_print*/
7911 0, /*tp_getattr*/
7912 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007913 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 0, /*tp_repr*/
7915 0, /*tp_as_number*/
7916 0, /*tp_as_sequence*/
7917 0, /*tp_as_mapping*/
7918 0, /*tp_hash*/
7919 0, /*tp_call*/
7920 0, /*tp_str*/
7921 0, /*tp_getattro*/
7922 0, /*tp_setattro*/
7923 0, /*tp_as_buffer*/
7924 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7925 0, /*tp_doc*/
7926 0, /*tp_traverse*/
7927 0, /*tp_clear*/
7928 0, /*tp_richcompare*/
7929 0, /*tp_weaklistoffset*/
7930 0, /*tp_iter*/
7931 0, /*tp_iternext*/
7932 encoding_map_methods, /*tp_methods*/
7933 0, /*tp_members*/
7934 0, /*tp_getset*/
7935 0, /*tp_base*/
7936 0, /*tp_dict*/
7937 0, /*tp_descr_get*/
7938 0, /*tp_descr_set*/
7939 0, /*tp_dictoffset*/
7940 0, /*tp_init*/
7941 0, /*tp_alloc*/
7942 0, /*tp_new*/
7943 0, /*tp_free*/
7944 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945};
7946
7947PyObject*
7948PyUnicode_BuildEncodingMap(PyObject* string)
7949{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007950 PyObject *result;
7951 struct encoding_map *mresult;
7952 int i;
7953 int need_dict = 0;
7954 unsigned char level1[32];
7955 unsigned char level2[512];
7956 unsigned char *mlevel1, *mlevel2, *mlevel3;
7957 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 int kind;
7959 void *data;
7960 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007963 PyErr_BadArgument();
7964 return NULL;
7965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007966 kind = PyUnicode_KIND(string);
7967 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 memset(level1, 0xFF, sizeof level1);
7969 memset(level2, 0xFF, sizeof level2);
7970
7971 /* If there isn't a one-to-one mapping of NULL to \0,
7972 or if there are non-BMP characters, we need to use
7973 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007975 need_dict = 1;
7976 for (i = 1; i < 256; i++) {
7977 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978 ch = PyUnicode_READ(kind, data, i);
7979 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980 need_dict = 1;
7981 break;
7982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984 /* unmapped character */
7985 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986 l1 = ch >> 11;
7987 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988 if (level1[l1] == 0xFF)
7989 level1[l1] = count2++;
7990 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992 }
7993
7994 if (count2 >= 0xFF || count3 >= 0xFF)
7995 need_dict = 1;
7996
7997 if (need_dict) {
7998 PyObject *result = PyDict_New();
7999 PyObject *key, *value;
8000 if (!result)
8001 return NULL;
8002 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008004 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005 if (!key || !value)
8006 goto failed1;
8007 if (PyDict_SetItem(result, key, value) == -1)
8008 goto failed1;
8009 Py_DECREF(key);
8010 Py_DECREF(value);
8011 }
8012 return result;
8013 failed1:
8014 Py_XDECREF(key);
8015 Py_XDECREF(value);
8016 Py_DECREF(result);
8017 return NULL;
8018 }
8019
8020 /* Create a three-level trie */
8021 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8022 16*count2 + 128*count3 - 1);
8023 if (!result)
8024 return PyErr_NoMemory();
8025 PyObject_Init(result, &EncodingMapType);
8026 mresult = (struct encoding_map*)result;
8027 mresult->count2 = count2;
8028 mresult->count3 = count3;
8029 mlevel1 = mresult->level1;
8030 mlevel2 = mresult->level23;
8031 mlevel3 = mresult->level23 + 16*count2;
8032 memcpy(mlevel1, level1, 32);
8033 memset(mlevel2, 0xFF, 16*count2);
8034 memset(mlevel3, 0, 128*count3);
8035 count3 = 0;
8036 for (i = 1; i < 256; i++) {
8037 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 /* unmapped character */
8040 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 o1 = PyUnicode_READ(kind, data, i)>>11;
8042 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 i2 = 16*mlevel1[o1] + o2;
8044 if (mlevel2[i2] == 0xFF)
8045 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047 i3 = 128*mlevel2[i2] + o3;
8048 mlevel3[i3] = i;
8049 }
8050 return result;
8051}
8052
8053static int
Victor Stinner22168992011-11-20 17:09:18 +01008054encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055{
8056 struct encoding_map *map = (struct encoding_map*)mapping;
8057 int l1 = c>>11;
8058 int l2 = (c>>7) & 0xF;
8059 int l3 = c & 0x7F;
8060 int i;
8061
Victor Stinner22168992011-11-20 17:09:18 +01008062 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064 if (c == 0)
8065 return 0;
8066 /* level 1*/
8067 i = map->level1[l1];
8068 if (i == 0xFF) {
8069 return -1;
8070 }
8071 /* level 2*/
8072 i = map->level23[16*i+l2];
8073 if (i == 0xFF) {
8074 return -1;
8075 }
8076 /* level 3 */
8077 i = map->level23[16*map->count2 + 128*i + l3];
8078 if (i == 0) {
8079 return -1;
8080 }
8081 return i;
8082}
8083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084/* Lookup the character ch in the mapping. If the character
8085 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008086 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008087static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008088charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089{
Christian Heimes217cfd12007-12-02 14:31:20 +00008090 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 PyObject *x;
8092
8093 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 x = PyObject_GetItem(mapping, w);
8096 Py_DECREF(w);
8097 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8099 /* No mapping found means: mapping is undefined. */
8100 PyErr_Clear();
8101 x = Py_None;
8102 Py_INCREF(x);
8103 return x;
8104 } else
8105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008107 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008109 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 long value = PyLong_AS_LONG(x);
8111 if (value < 0 || value > 255) {
8112 PyErr_SetString(PyExc_TypeError,
8113 "character mapping must be in range(256)");
8114 Py_DECREF(x);
8115 return NULL;
8116 }
8117 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008119 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 /* wrong return value */
8123 PyErr_Format(PyExc_TypeError,
8124 "character mapping must return integer, bytes or None, not %.400s",
8125 x->ob_type->tp_name);
8126 Py_DECREF(x);
8127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 }
8129}
8130
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008132charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8135 /* exponentially overallocate to minimize reallocations */
8136 if (requiredsize < 2*outsize)
8137 requiredsize = 2*outsize;
8138 if (_PyBytes_Resize(outobj, requiredsize))
8139 return -1;
8140 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141}
8142
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008147 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 space is available. Return a new reference to the object that
8149 was put in the output buffer, or Py_None, if the mapping was undefined
8150 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008151 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008152static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008153charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008154 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008155{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 PyObject *rep;
8157 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008158 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159
Christian Heimes90aa7642007-12-19 02:45:37 +00008160 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 if (res == -1)
8164 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 if (outsize<requiredsize)
8166 if (charmapencode_resize(outobj, outpos, requiredsize))
8167 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008168 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 outstart[(*outpos)++] = (char)res;
8170 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171 }
8172
8173 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 Py_DECREF(rep);
8178 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 if (PyLong_Check(rep)) {
8181 Py_ssize_t requiredsize = *outpos+1;
8182 if (outsize<requiredsize)
8183 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8184 Py_DECREF(rep);
8185 return enc_EXCEPTION;
8186 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008187 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 else {
8191 const char *repchars = PyBytes_AS_STRING(rep);
8192 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8193 Py_ssize_t requiredsize = *outpos+repsize;
8194 if (outsize<requiredsize)
8195 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8196 Py_DECREF(rep);
8197 return enc_EXCEPTION;
8198 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008199 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 memcpy(outstart + *outpos, repchars, repsize);
8201 *outpos += repsize;
8202 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 Py_DECREF(rep);
8205 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206}
8207
8208/* handle an error in PyUnicode_EncodeCharmap
8209 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008210static int
8211charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008212 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008214 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008215 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216{
8217 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008218 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008219 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008220 enum PyUnicode_Kind kind;
8221 void *data;
8222 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008224 Py_ssize_t collstartpos = *inpos;
8225 Py_ssize_t collendpos = *inpos+1;
8226 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 char *encoding = "charmap";
8228 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008230 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008231 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232
Benjamin Petersonbac79492012-01-14 13:34:47 -05008233 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008234 return -1;
8235 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 /* find all unencodable characters */
8237 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008239 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008241 val = encoding_map_lookup(ch, mapping);
8242 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 break;
8244 ++collendpos;
8245 continue;
8246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008247
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008248 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8249 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (rep==NULL)
8251 return -1;
8252 else if (rep!=Py_None) {
8253 Py_DECREF(rep);
8254 break;
8255 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008256 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 }
8259 /* cache callback name lookup
8260 * (if not done yet, i.e. it's the first error) */
8261 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 if ((errors==NULL) || (!strcmp(errors, "strict")))
8263 *known_errorHandler = 1;
8264 else if (!strcmp(errors, "replace"))
8265 *known_errorHandler = 2;
8266 else if (!strcmp(errors, "ignore"))
8267 *known_errorHandler = 3;
8268 else if (!strcmp(errors, "xmlcharrefreplace"))
8269 *known_errorHandler = 4;
8270 else
8271 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 }
8273 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008275 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 return -1;
8277 case 2: /* replace */
8278 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 x = charmapencode_output('?', mapping, res, respos);
8280 if (x==enc_EXCEPTION) {
8281 return -1;
8282 }
8283 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008284 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return -1;
8286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 }
8288 /* fall through */
8289 case 3: /* ignore */
8290 *inpos = collendpos;
8291 break;
8292 case 4: /* xmlcharrefreplace */
8293 /* generate replacement (temporarily (mis)uses p) */
8294 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 char buffer[2+29+1+1];
8296 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 for (cp = buffer; *cp; ++cp) {
8299 x = charmapencode_output(*cp, mapping, res, respos);
8300 if (x==enc_EXCEPTION)
8301 return -1;
8302 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008303 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return -1;
8305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 }
8307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008308 *inpos = collendpos;
8309 break;
8310 default:
8311 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008312 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008314 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008316 if (PyBytes_Check(repunicode)) {
8317 /* Directly copy bytes result to output. */
8318 Py_ssize_t outsize = PyBytes_Size(*res);
8319 Py_ssize_t requiredsize;
8320 repsize = PyBytes_Size(repunicode);
8321 requiredsize = *respos + repsize;
8322 if (requiredsize > outsize)
8323 /* Make room for all additional bytes. */
8324 if (charmapencode_resize(res, respos, requiredsize)) {
8325 Py_DECREF(repunicode);
8326 return -1;
8327 }
8328 memcpy(PyBytes_AsString(*res) + *respos,
8329 PyBytes_AsString(repunicode), repsize);
8330 *respos += repsize;
8331 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008332 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008333 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008336 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008337 Py_DECREF(repunicode);
8338 return -1;
8339 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008340 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008341 data = PyUnicode_DATA(repunicode);
8342 kind = PyUnicode_KIND(repunicode);
8343 for (index = 0; index < repsize; index++) {
8344 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8345 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008347 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return -1;
8349 }
8350 else if (x==enc_FAILED) {
8351 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008352 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 return -1;
8354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355 }
8356 *inpos = newpos;
8357 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 }
8359 return 0;
8360}
8361
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363_PyUnicode_EncodeCharmap(PyObject *unicode,
8364 PyObject *mapping,
8365 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 /* output object */
8368 PyObject *res = NULL;
8369 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008371 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 PyObject *errorHandler = NULL;
8375 PyObject *exc = NULL;
8376 /* the following variable is used for caching string comparisons
8377 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8378 * 3=ignore, 4=xmlcharrefreplace */
8379 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
Benjamin Petersonbac79492012-01-14 13:34:47 -05008381 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008382 return NULL;
8383 size = PyUnicode_GET_LENGTH(unicode);
8384
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 /* Default to Latin-1 */
8386 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 /* allocate enough for a simple encoding without
8390 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008391 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (res == NULL)
8393 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008394 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008400 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 if (x==enc_EXCEPTION) /* error */
8402 goto onError;
8403 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 &exc,
8406 &known_errorHandler, &errorHandler, errors,
8407 &res, &respos)) {
8408 goto onError;
8409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 else
8412 /* done with this character => adjust input position */
8413 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008417 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008418 if (_PyBytes_Resize(&res, respos) < 0)
8419 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 Py_XDECREF(exc);
8422 Py_XDECREF(errorHandler);
8423 return res;
8424
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 Py_XDECREF(res);
8427 Py_XDECREF(exc);
8428 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 return NULL;
8430}
8431
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008432/* Deprecated */
8433PyObject *
8434PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8435 Py_ssize_t size,
8436 PyObject *mapping,
8437 const char *errors)
8438{
8439 PyObject *result;
8440 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8441 if (unicode == NULL)
8442 return NULL;
8443 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8444 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008445 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446}
8447
Alexander Belopolsky40018472011-02-26 01:02:56 +00008448PyObject *
8449PyUnicode_AsCharmapString(PyObject *unicode,
8450 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451{
8452 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 PyErr_BadArgument();
8454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008456 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457}
8458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008460static void
8461make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008463 Py_ssize_t startpos, Py_ssize_t endpos,
8464 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 *exceptionObject = _PyUnicodeTranslateError_Create(
8468 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 }
8470 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8472 goto onError;
8473 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8474 goto onError;
8475 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8476 goto onError;
8477 return;
8478 onError:
8479 Py_DECREF(*exceptionObject);
8480 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 }
8482}
8483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485static void
8486raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008488 Py_ssize_t startpos, Py_ssize_t endpos,
8489 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490{
8491 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495}
8496
8497/* error handling callback helper:
8498 build arguments, call the callback and check the arguments,
8499 put the result into newpos and return the replacement string, which
8500 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008501static PyObject *
8502unicode_translate_call_errorhandler(const char *errors,
8503 PyObject **errorHandler,
8504 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506 Py_ssize_t startpos, Py_ssize_t endpos,
8507 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008509 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008511 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 PyObject *restuple;
8513 PyObject *resunicode;
8514
8515 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
8520
8521 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525
8526 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008531 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 Py_DECREF(restuple);
8533 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 }
8535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 &resunicode, &i_newpos)) {
8537 Py_DECREF(restuple);
8538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008542 else
8543 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8546 Py_DECREF(restuple);
8547 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 Py_INCREF(resunicode);
8550 Py_DECREF(restuple);
8551 return resunicode;
8552}
8553
8554/* Lookup the character ch in the mapping and put the result in result,
8555 which must be decrefed by the caller.
8556 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008557static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559{
Christian Heimes217cfd12007-12-02 14:31:20 +00008560 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 PyObject *x;
8562
8563 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 x = PyObject_GetItem(mapping, w);
8566 Py_DECREF(w);
8567 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8569 /* No mapping found means: use 1:1 mapping. */
8570 PyErr_Clear();
8571 *result = NULL;
8572 return 0;
8573 } else
8574 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 }
8576 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 *result = x;
8578 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008580 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 long value = PyLong_AS_LONG(x);
8582 long max = PyUnicode_GetMax();
8583 if (value < 0 || value > max) {
8584 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008585 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 Py_DECREF(x);
8587 return -1;
8588 }
8589 *result = x;
8590 return 0;
8591 }
8592 else if (PyUnicode_Check(x)) {
8593 *result = x;
8594 return 0;
8595 }
8596 else {
8597 /* wrong return value */
8598 PyErr_SetString(PyExc_TypeError,
8599 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 Py_DECREF(x);
8601 return -1;
8602 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603}
8604/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 if not reallocate and adjust various state variables.
8606 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008607static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008612 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 /* exponentially overallocate to minimize reallocations */
8614 if (requiredsize < 2 * oldsize)
8615 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8617 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 }
8621 return 0;
8622}
8623/* lookup the character, put the result in the output string and adjust
8624 various state variables. Return a new reference to the object that
8625 was put in the output buffer in *result, or Py_None, if the mapping was
8626 undefined (in which case no character was written).
8627 The called must decref result.
8628 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8631 PyObject *mapping, Py_UCS4 **output,
8632 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8636 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 }
8642 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008644 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 }
8648 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 Py_ssize_t repsize;
8650 if (PyUnicode_READY(*res) == -1)
8651 return -1;
8652 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 if (repsize==1) {
8654 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 }
8657 else if (repsize!=0) {
8658 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 Py_ssize_t requiredsize = *opos +
8660 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 Py_ssize_t i;
8663 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 for(i = 0; i < repsize; i++)
8666 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 return 0;
8672}
8673
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675_PyUnicode_TranslateCharmap(PyObject *input,
8676 PyObject *mapping,
8677 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 /* input object */
8680 char *idata;
8681 Py_ssize_t size, i;
8682 int kind;
8683 /* output buffer */
8684 Py_UCS4 *output = NULL;
8685 Py_ssize_t osize;
8686 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 char *reason = "character maps to <undefined>";
8690 PyObject *errorHandler = NULL;
8691 PyObject *exc = NULL;
8692 /* the following variable is used for caching string comparisons
8693 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8694 * 3=ignore, 4=xmlcharrefreplace */
8695 int known_errorHandler = -1;
8696
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 PyErr_BadArgument();
8699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 if (PyUnicode_READY(input) == -1)
8703 return NULL;
8704 idata = (char*)PyUnicode_DATA(input);
8705 kind = PyUnicode_KIND(input);
8706 size = PyUnicode_GET_LENGTH(input);
8707 i = 0;
8708
8709 if (size == 0) {
8710 Py_INCREF(input);
8711 return input;
8712 }
8713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 /* allocate enough for a simple 1:1 translation without
8715 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 osize = size;
8717 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8718 opos = 0;
8719 if (output == NULL) {
8720 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 /* try to encode it */
8726 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 if (charmaptranslate_output(input, i, mapping,
8728 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 Py_XDECREF(x);
8730 goto onError;
8731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008732 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 else { /* untranslatable character */
8736 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8737 Py_ssize_t repsize;
8738 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 Py_ssize_t collstart = i;
8742 Py_ssize_t collend = i+1;
8743 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 while (collend < size) {
8747 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 goto onError;
8749 Py_XDECREF(x);
8750 if (x!=Py_None)
8751 break;
8752 ++collend;
8753 }
8754 /* cache callback name lookup
8755 * (if not done yet, i.e. it's the first error) */
8756 if (known_errorHandler==-1) {
8757 if ((errors==NULL) || (!strcmp(errors, "strict")))
8758 known_errorHandler = 1;
8759 else if (!strcmp(errors, "replace"))
8760 known_errorHandler = 2;
8761 else if (!strcmp(errors, "ignore"))
8762 known_errorHandler = 3;
8763 else if (!strcmp(errors, "xmlcharrefreplace"))
8764 known_errorHandler = 4;
8765 else
8766 known_errorHandler = 0;
8767 }
8768 switch (known_errorHandler) {
8769 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 raise_translate_exception(&exc, input, collstart,
8771 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008772 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 case 2: /* replace */
8774 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 for (coll = collstart; coll<collend; coll++)
8776 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 /* fall through */
8778 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 break;
8781 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 /* generate replacement (temporarily (mis)uses i) */
8783 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 char buffer[2+29+1+1];
8785 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8787 if (charmaptranslate_makespace(&output, &osize,
8788 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 goto onError;
8790 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 break;
8795 default:
8796 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 reason, input, &exc,
8798 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008799 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008801 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008802 Py_DECREF(repunicode);
8803 goto onError;
8804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 repsize = PyUnicode_GET_LENGTH(repunicode);
8807 if (charmaptranslate_makespace(&output, &osize,
8808 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 Py_DECREF(repunicode);
8810 goto onError;
8811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 for (uni2 = 0; repsize-->0; ++uni2)
8813 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8814 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008816 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 }
8818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8820 if (!res)
8821 goto onError;
8822 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823 Py_XDECREF(exc);
8824 Py_XDECREF(errorHandler);
8825 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 Py_XDECREF(exc);
8830 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 return NULL;
8832}
8833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834/* Deprecated. Use PyUnicode_Translate instead. */
8835PyObject *
8836PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8837 Py_ssize_t size,
8838 PyObject *mapping,
8839 const char *errors)
8840{
8841 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8842 if (!unicode)
8843 return NULL;
8844 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8845}
8846
Alexander Belopolsky40018472011-02-26 01:02:56 +00008847PyObject *
8848PyUnicode_Translate(PyObject *str,
8849 PyObject *mapping,
8850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851{
8852 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008853
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 str = PyUnicode_FromObject(str);
8855 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 Py_DECREF(str);
8859 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008860
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 Py_XDECREF(str);
8863 return NULL;
8864}
Tim Petersced69f82003-09-16 20:30:58 +00008865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008867fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868{
8869 /* No need to call PyUnicode_READY(self) because this function is only
8870 called as a callback from fixup() which does it already. */
8871 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8872 const int kind = PyUnicode_KIND(self);
8873 void *data = PyUnicode_DATA(self);
8874 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008875 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 Py_ssize_t i;
8877
8878 for (i = 0; i < len; ++i) {
8879 ch = PyUnicode_READ(kind, data, i);
8880 fixed = 0;
8881 if (ch > 127) {
8882 if (Py_UNICODE_ISSPACE(ch))
8883 fixed = ' ';
8884 else {
8885 const int decimal = Py_UNICODE_TODECIMAL(ch);
8886 if (decimal >= 0)
8887 fixed = '0' + decimal;
8888 }
8889 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008890 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 if (fixed > maxchar)
8892 maxchar = fixed;
8893 PyUnicode_WRITE(kind, data, i, fixed);
8894 }
8895 else if (ch > maxchar)
8896 maxchar = ch;
8897 }
8898 else if (ch > maxchar)
8899 maxchar = ch;
8900 }
8901
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008902 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903}
8904
8905PyObject *
8906_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8907{
8908 if (!PyUnicode_Check(unicode)) {
8909 PyErr_BadInternalCall();
8910 return NULL;
8911 }
8912 if (PyUnicode_READY(unicode) == -1)
8913 return NULL;
8914 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8915 /* If the string is already ASCII, just return the same string */
8916 Py_INCREF(unicode);
8917 return unicode;
8918 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008919 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920}
8921
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008922PyObject *
8923PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8924 Py_ssize_t length)
8925{
Victor Stinnerf0124502011-11-21 23:12:56 +01008926 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008927 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008928 Py_UCS4 maxchar;
8929 enum PyUnicode_Kind kind;
8930 void *data;
8931
8932 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008933 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008934 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008935 if (ch > 127) {
8936 int decimal = Py_UNICODE_TODECIMAL(ch);
8937 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008938 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008939 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008940 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008941 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008942
8943 /* Copy to a new string */
8944 decimal = PyUnicode_New(length, maxchar);
8945 if (decimal == NULL)
8946 return decimal;
8947 kind = PyUnicode_KIND(decimal);
8948 data = PyUnicode_DATA(decimal);
8949 /* Iterate over code points */
8950 for (i = 0; i < length; i++) {
8951 Py_UNICODE ch = s[i];
8952 if (ch > 127) {
8953 int decimal = Py_UNICODE_TODECIMAL(ch);
8954 if (decimal >= 0)
8955 ch = '0' + decimal;
8956 }
8957 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008959 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008960}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008961/* --- Decimal Encoder ---------------------------------------------------- */
8962
Alexander Belopolsky40018472011-02-26 01:02:56 +00008963int
8964PyUnicode_EncodeDecimal(Py_UNICODE *s,
8965 Py_ssize_t length,
8966 char *output,
8967 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008968{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008969 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008970 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008971 enum PyUnicode_Kind kind;
8972 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008973
8974 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 PyErr_BadArgument();
8976 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008977 }
8978
Victor Stinner42bf7752011-11-21 22:52:58 +01008979 unicode = PyUnicode_FromUnicode(s, length);
8980 if (unicode == NULL)
8981 return -1;
8982
Benjamin Petersonbac79492012-01-14 13:34:47 -05008983 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008984 Py_DECREF(unicode);
8985 return -1;
8986 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008987 kind = PyUnicode_KIND(unicode);
8988 data = PyUnicode_DATA(unicode);
8989
Victor Stinnerb84d7232011-11-22 01:50:07 +01008990 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008991 PyObject *exc;
8992 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008994 Py_ssize_t startpos;
8995
8996 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008999 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009000 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 decimal = Py_UNICODE_TODECIMAL(ch);
9004 if (decimal >= 0) {
9005 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009006 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 continue;
9008 }
9009 if (0 < ch && ch < 256) {
9010 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009011 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 continue;
9013 }
Victor Stinner6345be92011-11-25 20:09:01 +01009014
Victor Stinner42bf7752011-11-21 22:52:58 +01009015 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009016 exc = NULL;
9017 raise_encode_exception(&exc, "decimal", unicode,
9018 startpos, startpos+1,
9019 "invalid decimal Unicode string");
9020 Py_XDECREF(exc);
9021 Py_DECREF(unicode);
9022 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009023 }
9024 /* 0-terminate the output string */
9025 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009026 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009027 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009028}
9029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030/* --- Helpers ------------------------------------------------------------ */
9031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009033any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 Py_ssize_t start,
9035 Py_ssize_t end)
9036{
9037 int kind1, kind2, kind;
9038 void *buf1, *buf2;
9039 Py_ssize_t len1, len2, result;
9040
9041 kind1 = PyUnicode_KIND(s1);
9042 kind2 = PyUnicode_KIND(s2);
9043 kind = kind1 > kind2 ? kind1 : kind2;
9044 buf1 = PyUnicode_DATA(s1);
9045 buf2 = PyUnicode_DATA(s2);
9046 if (kind1 != kind)
9047 buf1 = _PyUnicode_AsKind(s1, kind);
9048 if (!buf1)
9049 return -2;
9050 if (kind2 != kind)
9051 buf2 = _PyUnicode_AsKind(s2, kind);
9052 if (!buf2) {
9053 if (kind1 != kind) PyMem_Free(buf1);
9054 return -2;
9055 }
9056 len1 = PyUnicode_GET_LENGTH(s1);
9057 len2 = PyUnicode_GET_LENGTH(s2);
9058
Victor Stinner794d5672011-10-10 03:21:36 +02009059 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009060 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009061 case PyUnicode_1BYTE_KIND:
9062 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9063 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9064 else
9065 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9066 break;
9067 case PyUnicode_2BYTE_KIND:
9068 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9069 break;
9070 case PyUnicode_4BYTE_KIND:
9071 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9072 break;
9073 default:
9074 assert(0); result = -2;
9075 }
9076 }
9077 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009078 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009079 case PyUnicode_1BYTE_KIND:
9080 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9081 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082 else
9083 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9084 break;
9085 case PyUnicode_2BYTE_KIND:
9086 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9087 break;
9088 case PyUnicode_4BYTE_KIND:
9089 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9090 break;
9091 default:
9092 assert(0); result = -2;
9093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 }
9095
9096 if (kind1 != kind)
9097 PyMem_Free(buf1);
9098 if (kind2 != kind)
9099 PyMem_Free(buf2);
9100
9101 return result;
9102}
9103
9104Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009105_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 Py_ssize_t n_buffer,
9107 void *digits, Py_ssize_t n_digits,
9108 Py_ssize_t min_width,
9109 const char *grouping,
9110 const char *thousands_sep)
9111{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009112 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009114 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9115 return _PyUnicode_ascii_InsertThousandsGrouping(
9116 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9117 min_width, grouping, thousands_sep);
9118 else
9119 return _PyUnicode_ucs1_InsertThousandsGrouping(
9120 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9121 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 case PyUnicode_2BYTE_KIND:
9123 return _PyUnicode_ucs2_InsertThousandsGrouping(
9124 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9125 min_width, grouping, thousands_sep);
9126 case PyUnicode_4BYTE_KIND:
9127 return _PyUnicode_ucs4_InsertThousandsGrouping(
9128 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9129 min_width, grouping, thousands_sep);
9130 }
9131 assert(0);
9132 return -1;
9133}
9134
9135
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length and may be evaluated more than once.
     - end > len is clipped to len;
     - a negative end or start is taken relative to len and clipped to 0.
   A start larger than len is left unchanged.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and is safe inside an unbraced if/else.

   NOTE(review): the #undef guards against an earlier definition of the
   same name with a different replacement list -- confirm whether any
   included header defines one. */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009150
Alexander Belopolsky40018472011-02-26 01:02:56 +00009151Py_ssize_t
9152PyUnicode_Count(PyObject *str,
9153 PyObject *substr,
9154 Py_ssize_t start,
9155 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009157 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009158 PyObject* str_obj;
9159 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 int kind1, kind2, kind;
9161 void *buf1 = NULL, *buf2 = NULL;
9162 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009163
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009164 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009165 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009167 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009168 if (!sub_obj) {
9169 Py_DECREF(str_obj);
9170 return -1;
9171 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009172 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009173 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 Py_DECREF(str_obj);
9175 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176 }
Tim Petersced69f82003-09-16 20:30:58 +00009177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 kind1 = PyUnicode_KIND(str_obj);
9179 kind2 = PyUnicode_KIND(sub_obj);
9180 kind = kind1 > kind2 ? kind1 : kind2;
9181 buf1 = PyUnicode_DATA(str_obj);
9182 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009183 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 if (!buf1)
9185 goto onError;
9186 buf2 = PyUnicode_DATA(sub_obj);
9187 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009188 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (!buf2)
9190 goto onError;
9191 len1 = PyUnicode_GET_LENGTH(str_obj);
9192 len2 = PyUnicode_GET_LENGTH(sub_obj);
9193
9194 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009195 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009197 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9198 result = asciilib_count(
9199 ((Py_UCS1*)buf1) + start, end - start,
9200 buf2, len2, PY_SSIZE_T_MAX
9201 );
9202 else
9203 result = ucs1lib_count(
9204 ((Py_UCS1*)buf1) + start, end - start,
9205 buf2, len2, PY_SSIZE_T_MAX
9206 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 break;
9208 case PyUnicode_2BYTE_KIND:
9209 result = ucs2lib_count(
9210 ((Py_UCS2*)buf1) + start, end - start,
9211 buf2, len2, PY_SSIZE_T_MAX
9212 );
9213 break;
9214 case PyUnicode_4BYTE_KIND:
9215 result = ucs4lib_count(
9216 ((Py_UCS4*)buf1) + start, end - start,
9217 buf2, len2, PY_SSIZE_T_MAX
9218 );
9219 break;
9220 default:
9221 assert(0); result = 0;
9222 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009223
9224 Py_DECREF(sub_obj);
9225 Py_DECREF(str_obj);
9226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (kind1 != kind)
9228 PyMem_Free(buf1);
9229 if (kind2 != kind)
9230 PyMem_Free(buf2);
9231
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 onError:
9234 Py_DECREF(sub_obj);
9235 Py_DECREF(str_obj);
9236 if (kind1 != kind && buf1)
9237 PyMem_Free(buf1);
9238 if (kind2 != kind && buf2)
9239 PyMem_Free(buf2);
9240 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241}
9242
Alexander Belopolsky40018472011-02-26 01:02:56 +00009243Py_ssize_t
9244PyUnicode_Find(PyObject *str,
9245 PyObject *sub,
9246 Py_ssize_t start,
9247 Py_ssize_t end,
9248 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009250 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009251
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009253 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009256 if (!sub) {
9257 Py_DECREF(str);
9258 return -2;
9259 }
9260 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9261 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 Py_DECREF(str);
9263 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 }
Tim Petersced69f82003-09-16 20:30:58 +00009265
Victor Stinner794d5672011-10-10 03:21:36 +02009266 result = any_find_slice(direction,
9267 str, sub, start, end
9268 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009269
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009271 Py_DECREF(sub);
9272
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 return result;
9274}
9275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276Py_ssize_t
9277PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9278 Py_ssize_t start, Py_ssize_t end,
9279 int direction)
9280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009282 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 if (PyUnicode_READY(str) == -1)
9284 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009285 if (start < 0 || end < 0) {
9286 PyErr_SetString(PyExc_IndexError, "string index out of range");
9287 return -2;
9288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 if (end > PyUnicode_GET_LENGTH(str))
9290 end = PyUnicode_GET_LENGTH(str);
9291 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009292 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9293 kind, end-start, ch, direction);
9294 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009296 else
9297 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298}
9299
Alexander Belopolsky40018472011-02-26 01:02:56 +00009300static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009301tailmatch(PyObject *self,
9302 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009303 Py_ssize_t start,
9304 Py_ssize_t end,
9305 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 int kind_self;
9308 int kind_sub;
9309 void *data_self;
9310 void *data_sub;
9311 Py_ssize_t offset;
9312 Py_ssize_t i;
9313 Py_ssize_t end_sub;
9314
9315 if (PyUnicode_READY(self) == -1 ||
9316 PyUnicode_READY(substring) == -1)
9317 return 0;
9318
9319 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320 return 1;
9321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9323 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 kind_self = PyUnicode_KIND(self);
9328 data_self = PyUnicode_DATA(self);
9329 kind_sub = PyUnicode_KIND(substring);
9330 data_sub = PyUnicode_DATA(substring);
9331 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9332
9333 if (direction > 0)
9334 offset = end;
9335 else
9336 offset = start;
9337
9338 if (PyUnicode_READ(kind_self, data_self, offset) ==
9339 PyUnicode_READ(kind_sub, data_sub, 0) &&
9340 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9341 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9342 /* If both are of the same kind, memcmp is sufficient */
9343 if (kind_self == kind_sub) {
9344 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009345 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 data_sub,
9347 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009348 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 }
9350 /* otherwise we have to compare each character by first accesing it */
9351 else {
9352 /* We do not need to compare 0 and len(substring)-1 because
9353 the if statement above ensured already that they are equal
9354 when we end up here. */
9355 // TODO: honor direction and do a forward or backwards search
9356 for (i = 1; i < end_sub; ++i) {
9357 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9358 PyUnicode_READ(kind_sub, data_sub, i))
9359 return 0;
9360 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 }
9364
9365 return 0;
9366}
9367
Alexander Belopolsky40018472011-02-26 01:02:56 +00009368Py_ssize_t
9369PyUnicode_Tailmatch(PyObject *str,
9370 PyObject *substr,
9371 Py_ssize_t start,
9372 Py_ssize_t end,
9373 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009375 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 str = PyUnicode_FromObject(str);
9378 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 substr = PyUnicode_FromObject(substr);
9381 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 Py_DECREF(str);
9383 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 }
Tim Petersced69f82003-09-16 20:30:58 +00009385
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009386 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 Py_DECREF(str);
9389 Py_DECREF(substr);
9390 return result;
9391}
9392
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393/* Apply fixfct filter to the Unicode object self and return a
9394 reference to the modified object */
9395
Alexander Belopolsky40018472011-02-26 01:02:56 +00009396static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009397fixup(PyObject *self,
9398 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 PyObject *u;
9401 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009402 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009404 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009407 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 /* fix functions return the new maximum character in a string,
9410 if the kind of the resulting unicode object does not change,
9411 everything is fine. Otherwise we need to change the string kind
9412 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009413 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009414
9415 if (maxchar_new == 0) {
9416 /* no changes */;
9417 if (PyUnicode_CheckExact(self)) {
9418 Py_DECREF(u);
9419 Py_INCREF(self);
9420 return self;
9421 }
9422 else
9423 return u;
9424 }
9425
9426 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 maxchar_new = 127;
9428 else if (maxchar_new <= 255)
9429 maxchar_new = 255;
9430 else if (maxchar_new <= 65535)
9431 maxchar_new = 65535;
9432 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009433 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434
Victor Stinnereaab6042011-12-11 22:22:39 +01009435 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009437
9438 /* In case the maximum character changed, we need to
9439 convert the string to the new category. */
9440 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9441 if (v == NULL) {
9442 Py_DECREF(u);
9443 return NULL;
9444 }
9445 if (maxchar_new > maxchar_old) {
9446 /* If the maxchar increased so that the kind changed, not all
9447 characters are representable anymore and we need to fix the
9448 string again. This only happens in very few cases. */
9449 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9450 maxchar_old = fixfct(v);
9451 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 }
9453 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009454 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009456 Py_DECREF(u);
9457 assert(_PyUnicode_CheckConsistency(v, 1));
9458 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459}
9460
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461static PyObject *
9462ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009464 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465 char *resdata, *data = PyUnicode_DATA(self);
9466 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009467
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 res = PyUnicode_New(len, 127);
9469 if (res == NULL)
9470 return NULL;
9471 resdata = PyUnicode_DATA(res);
9472 if (lower)
9473 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009475 _Py_bytes_upper(resdata, data, len);
9476 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477}
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482 Py_ssize_t j;
9483 int final_sigma;
9484 Py_UCS4 c;
9485 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009486
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009487 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9488
9489 where ! is a negation and \p{xxx} is a character with property xxx.
9490 */
9491 for (j = i - 1; j >= 0; j--) {
9492 c = PyUnicode_READ(kind, data, j);
9493 if (!_PyUnicode_IsCaseIgnorable(c))
9494 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009496 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9497 if (final_sigma) {
9498 for (j = i + 1; j < length; j++) {
9499 c = PyUnicode_READ(kind, data, j);
9500 if (!_PyUnicode_IsCaseIgnorable(c))
9501 break;
9502 }
9503 final_sigma = j == length || !_PyUnicode_IsCased(c);
9504 }
9505 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506}
9507
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508static int
9509lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9510 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 /* Obscure special case. */
9513 if (c == 0x3A3) {
9514 mapped[0] = handle_capital_sigma(kind, data, length, i);
9515 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520static Py_ssize_t
9521do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 Py_ssize_t i, k = 0;
9524 int n_res, j;
9525 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009526
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 c = PyUnicode_READ(kind, data, 0);
9528 n_res = _PyUnicode_ToUpperFull(c, mapped);
9529 for (j = 0; j < n_res; j++) {
9530 if (mapped[j] > *maxchar)
9531 *maxchar = mapped[j];
9532 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009534 for (i = 1; i < length; i++) {
9535 c = PyUnicode_READ(kind, data, i);
9536 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9537 for (j = 0; j < n_res; j++) {
9538 if (mapped[j] > *maxchar)
9539 *maxchar = mapped[j];
9540 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009541 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009542 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009543 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544}
9545
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009546static Py_ssize_t
9547do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9548 Py_ssize_t i, k = 0;
9549
9550 for (i = 0; i < length; i++) {
9551 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9552 int n_res, j;
9553 if (Py_UNICODE_ISUPPER(c)) {
9554 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9555 }
9556 else if (Py_UNICODE_ISLOWER(c)) {
9557 n_res = _PyUnicode_ToUpperFull(c, mapped);
9558 }
9559 else {
9560 n_res = 1;
9561 mapped[0] = c;
9562 }
9563 for (j = 0; j < n_res; j++) {
9564 if (mapped[j] > *maxchar)
9565 *maxchar = mapped[j];
9566 res[k++] = mapped[j];
9567 }
9568 }
9569 return k;
9570}
9571
9572static Py_ssize_t
9573do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9574 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009576 Py_ssize_t i, k = 0;
9577
9578 for (i = 0; i < length; i++) {
9579 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9580 int n_res, j;
9581 if (lower)
9582 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9583 else
9584 n_res = _PyUnicode_ToUpperFull(c, mapped);
9585 for (j = 0; j < n_res; j++) {
9586 if (mapped[j] > *maxchar)
9587 *maxchar = mapped[j];
9588 res[k++] = mapped[j];
9589 }
9590 }
9591 return k;
9592}
9593
9594static Py_ssize_t
9595do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9596{
9597 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9598}
9599
9600static Py_ssize_t
9601do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9602{
9603 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9604}
9605
Benjamin Petersone51757f2012-01-12 21:10:29 -05009606static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009607do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9608{
9609 Py_ssize_t i, k = 0;
9610
9611 for (i = 0; i < length; i++) {
9612 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9613 Py_UCS4 mapped[3];
9614 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9615 for (j = 0; j < n_res; j++) {
9616 if (mapped[j] > *maxchar)
9617 *maxchar = mapped[j];
9618 res[k++] = mapped[j];
9619 }
9620 }
9621 return k;
9622}
9623
9624static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009625do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9626{
9627 Py_ssize_t i, k = 0;
9628 int previous_is_cased;
9629
9630 previous_is_cased = 0;
9631 for (i = 0; i < length; i++) {
9632 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9633 Py_UCS4 mapped[3];
9634 int n_res, j;
9635
9636 if (previous_is_cased)
9637 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9638 else
9639 n_res = _PyUnicode_ToTitleFull(c, mapped);
9640
9641 for (j = 0; j < n_res; j++) {
9642 if (mapped[j] > *maxchar)
9643 *maxchar = mapped[j];
9644 res[k++] = mapped[j];
9645 }
9646
9647 previous_is_cased = _PyUnicode_IsCased(c);
9648 }
9649 return k;
9650}
9651
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652static PyObject *
9653case_operation(PyObject *self,
9654 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9655{
9656 PyObject *res = NULL;
9657 Py_ssize_t length, newlength = 0;
9658 int kind, outkind;
9659 void *data, *outdata;
9660 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9661
Benjamin Petersoneea48462012-01-16 14:28:50 -05009662 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663
9664 kind = PyUnicode_KIND(self);
9665 data = PyUnicode_DATA(self);
9666 length = PyUnicode_GET_LENGTH(self);
9667 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9668 if (tmp == NULL)
9669 return PyErr_NoMemory();
9670 newlength = perform(kind, data, length, tmp, &maxchar);
9671 res = PyUnicode_New(newlength, maxchar);
9672 if (res == NULL)
9673 goto leave;
9674 tmpend = tmp + newlength;
9675 outdata = PyUnicode_DATA(res);
9676 outkind = PyUnicode_KIND(res);
9677 switch (outkind) {
9678 case PyUnicode_1BYTE_KIND:
9679 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9680 break;
9681 case PyUnicode_2BYTE_KIND:
9682 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9683 break;
9684 case PyUnicode_4BYTE_KIND:
9685 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9686 break;
9687 default:
9688 assert(0);
9689 break;
9690 }
9691 leave:
9692 PyMem_FREE(tmp);
9693 return res;
9694}
9695
Tim Peters8ce9f162004-08-27 01:49:32 +00009696PyObject *
9697PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009700 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009702 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009703 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9704 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009705 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009707 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009709 int use_memcpy;
9710 unsigned char *res_data = NULL, *sep_data = NULL;
9711 PyObject *last_obj;
9712 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713
Tim Peters05eba1f2004-08-27 21:32:02 +00009714 fseq = PySequence_Fast(seq, "");
9715 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009717 }
9718
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 /* NOTE: the following code can't call back into Python code,
9720 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009721 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009722
Tim Peters05eba1f2004-08-27 21:32:02 +00009723 seqlen = PySequence_Fast_GET_SIZE(fseq);
9724 /* If empty sequence, return u"". */
9725 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009726 Py_DECREF(fseq);
9727 Py_INCREF(unicode_empty);
9728 res = unicode_empty;
9729 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009730 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009731
Tim Peters05eba1f2004-08-27 21:32:02 +00009732 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009733 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009734 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009735 if (seqlen == 1) {
9736 if (PyUnicode_CheckExact(items[0])) {
9737 res = items[0];
9738 Py_INCREF(res);
9739 Py_DECREF(fseq);
9740 return res;
9741 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009742 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009743 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009744 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009745 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009746 /* Set up sep and seplen */
9747 if (separator == NULL) {
9748 /* fall back to a blank space separator */
9749 sep = PyUnicode_FromOrdinal(' ');
9750 if (!sep)
9751 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009752 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009753 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009754 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009755 else {
9756 if (!PyUnicode_Check(separator)) {
9757 PyErr_Format(PyExc_TypeError,
9758 "separator: expected str instance,"
9759 " %.80s found",
9760 Py_TYPE(separator)->tp_name);
9761 goto onError;
9762 }
9763 if (PyUnicode_READY(separator))
9764 goto onError;
9765 sep = separator;
9766 seplen = PyUnicode_GET_LENGTH(separator);
9767 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9768 /* inc refcount to keep this code path symmetric with the
9769 above case of a blank separator */
9770 Py_INCREF(sep);
9771 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009772 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009773 }
9774
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009775 /* There are at least two things to join, or else we have a subclass
9776 * of str in the sequence.
9777 * Do a pre-pass to figure out the total amount of space we'll
9778 * need (sz), and see whether all argument are strings.
9779 */
9780 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009781#ifdef Py_DEBUG
9782 use_memcpy = 0;
9783#else
9784 use_memcpy = 1;
9785#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009786 for (i = 0; i < seqlen; i++) {
9787 const Py_ssize_t old_sz = sz;
9788 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 if (!PyUnicode_Check(item)) {
9790 PyErr_Format(PyExc_TypeError,
9791 "sequence item %zd: expected str instance,"
9792 " %.80s found",
9793 i, Py_TYPE(item)->tp_name);
9794 goto onError;
9795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 if (PyUnicode_READY(item) == -1)
9797 goto onError;
9798 sz += PyUnicode_GET_LENGTH(item);
9799 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009800 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009801 if (i != 0)
9802 sz += seplen;
9803 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9804 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009805 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009806 goto onError;
9807 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009808 if (use_memcpy && last_obj != NULL) {
9809 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9810 use_memcpy = 0;
9811 }
9812 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009813 }
Tim Petersced69f82003-09-16 20:30:58 +00009814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009816 if (res == NULL)
9817 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009818
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009819 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009820#ifdef Py_DEBUG
9821 use_memcpy = 0;
9822#else
9823 if (use_memcpy) {
9824 res_data = PyUnicode_1BYTE_DATA(res);
9825 kind = PyUnicode_KIND(res);
9826 if (seplen != 0)
9827 sep_data = PyUnicode_1BYTE_DATA(sep);
9828 }
9829#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009831 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009832 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009834 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009835 if (use_memcpy) {
9836 Py_MEMCPY(res_data,
9837 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009838 kind * seplen);
9839 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009840 }
9841 else {
9842 copy_characters(res, res_offset, sep, 0, seplen);
9843 res_offset += seplen;
9844 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009846 itemlen = PyUnicode_GET_LENGTH(item);
9847 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009848 if (use_memcpy) {
9849 Py_MEMCPY(res_data,
9850 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009851 kind * itemlen);
9852 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009853 }
9854 else {
9855 copy_characters(res, res_offset, item, 0, itemlen);
9856 res_offset += itemlen;
9857 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009858 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009859 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 if (use_memcpy)
9861 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009862 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009863 else
9864 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009865
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009868 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009874 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 return NULL;
9876}
9877
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width and must not be
   PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that arbitrary
   expressions can be passed, but `kind`, `value` and `length` are still
   evaluated more than once: do not pass expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9901
Victor Stinner3fe55312012-01-04 00:33:50 +01009902Py_ssize_t
9903PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9904 Py_UCS4 fill_char)
9905{
9906 Py_ssize_t maxlen;
9907 enum PyUnicode_Kind kind;
9908 void *data;
9909
9910 if (!PyUnicode_Check(unicode)) {
9911 PyErr_BadInternalCall();
9912 return -1;
9913 }
9914 if (PyUnicode_READY(unicode) == -1)
9915 return -1;
9916 if (unicode_check_modifiable(unicode))
9917 return -1;
9918
9919 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9920 PyErr_SetString(PyExc_ValueError,
9921 "fill character is bigger than "
9922 "the string maximum character");
9923 return -1;
9924 }
9925
9926 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9927 length = Py_MIN(maxlen, length);
9928 if (length <= 0)
9929 return 0;
9930
9931 kind = PyUnicode_KIND(unicode);
9932 data = PyUnicode_DATA(unicode);
9933 FILL(kind, data, fill_char, start, length);
9934 return length;
9935}
9936
Victor Stinner9310abb2011-10-05 00:59:23 +02009937static PyObject *
9938pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009939 Py_ssize_t left,
9940 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 PyObject *u;
9944 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009945 int kind;
9946 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
9948 if (left < 0)
9949 left = 0;
9950 if (right < 0)
9951 right = 0;
9952
Victor Stinnerc4b49542011-12-11 22:44:26 +01009953 if (left == 0 && right == 0)
9954 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9957 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009958 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9959 return NULL;
9960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9962 if (fill > maxchar)
9963 maxchar = fill;
9964 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009965 if (!u)
9966 return NULL;
9967
9968 kind = PyUnicode_KIND(u);
9969 data = PyUnicode_DATA(u);
9970 if (left)
9971 FILL(kind, data, fill, 0, left);
9972 if (right)
9973 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009974 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009975 assert(_PyUnicode_CheckConsistency(u, 1));
9976 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979
Alexander Belopolsky40018472011-02-26 01:02:56 +00009980PyObject *
9981PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984
9985 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009986 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009988 if (PyUnicode_READY(string) == -1) {
9989 Py_DECREF(string);
9990 return NULL;
9991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992
Benjamin Petersonead6b532011-12-20 17:23:42 -06009993 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009995 if (PyUnicode_IS_ASCII(string))
9996 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009997 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009998 PyUnicode_GET_LENGTH(string), keepends);
9999 else
10000 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010001 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010002 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 break;
10004 case PyUnicode_2BYTE_KIND:
10005 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010006 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 PyUnicode_GET_LENGTH(string), keepends);
10008 break;
10009 case PyUnicode_4BYTE_KIND:
10010 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010011 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 PyUnicode_GET_LENGTH(string), keepends);
10013 break;
10014 default:
10015 assert(0);
10016 list = 0;
10017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 Py_DECREF(string);
10019 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020}
10021
Alexander Belopolsky40018472011-02-26 01:02:56 +000010022static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010023split(PyObject *self,
10024 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010025 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 int kind1, kind2, kind;
10028 void *buf1, *buf2;
10029 Py_ssize_t len1, len2;
10030 PyObject* out;
10031
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010033 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (PyUnicode_READY(self) == -1)
10036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010039 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010041 if (PyUnicode_IS_ASCII(self))
10042 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010043 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010044 PyUnicode_GET_LENGTH(self), maxcount
10045 );
10046 else
10047 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010048 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010049 PyUnicode_GET_LENGTH(self), maxcount
10050 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 case PyUnicode_2BYTE_KIND:
10052 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010053 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 PyUnicode_GET_LENGTH(self), maxcount
10055 );
10056 case PyUnicode_4BYTE_KIND:
10057 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010058 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 PyUnicode_GET_LENGTH(self), maxcount
10060 );
10061 default:
10062 assert(0);
10063 return NULL;
10064 }
10065
10066 if (PyUnicode_READY(substring) == -1)
10067 return NULL;
10068
10069 kind1 = PyUnicode_KIND(self);
10070 kind2 = PyUnicode_KIND(substring);
10071 kind = kind1 > kind2 ? kind1 : kind2;
10072 buf1 = PyUnicode_DATA(self);
10073 buf2 = PyUnicode_DATA(substring);
10074 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010075 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 if (!buf1)
10077 return NULL;
10078 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010079 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 if (!buf2) {
10081 if (kind1 != kind) PyMem_Free(buf1);
10082 return NULL;
10083 }
10084 len1 = PyUnicode_GET_LENGTH(self);
10085 len2 = PyUnicode_GET_LENGTH(substring);
10086
Benjamin Petersonead6b532011-12-20 17:23:42 -060010087 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010089 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10090 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010091 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010092 else
10093 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010094 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 break;
10096 case PyUnicode_2BYTE_KIND:
10097 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010098 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 break;
10100 case PyUnicode_4BYTE_KIND:
10101 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010102 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 break;
10104 default:
10105 out = NULL;
10106 }
10107 if (kind1 != kind)
10108 PyMem_Free(buf1);
10109 if (kind2 != kind)
10110 PyMem_Free(buf2);
10111 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112}
10113
Alexander Belopolsky40018472011-02-26 01:02:56 +000010114static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010115rsplit(PyObject *self,
10116 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010117 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 int kind1, kind2, kind;
10120 void *buf1, *buf2;
10121 Py_ssize_t len1, len2;
10122 PyObject* out;
10123
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010124 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010125 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 if (PyUnicode_READY(self) == -1)
10128 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010131 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010133 if (PyUnicode_IS_ASCII(self))
10134 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010135 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010136 PyUnicode_GET_LENGTH(self), maxcount
10137 );
10138 else
10139 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010140 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010141 PyUnicode_GET_LENGTH(self), maxcount
10142 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 case PyUnicode_2BYTE_KIND:
10144 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010145 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 PyUnicode_GET_LENGTH(self), maxcount
10147 );
10148 case PyUnicode_4BYTE_KIND:
10149 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010150 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 PyUnicode_GET_LENGTH(self), maxcount
10152 );
10153 default:
10154 assert(0);
10155 return NULL;
10156 }
10157
10158 if (PyUnicode_READY(substring) == -1)
10159 return NULL;
10160
10161 kind1 = PyUnicode_KIND(self);
10162 kind2 = PyUnicode_KIND(substring);
10163 kind = kind1 > kind2 ? kind1 : kind2;
10164 buf1 = PyUnicode_DATA(self);
10165 buf2 = PyUnicode_DATA(substring);
10166 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (!buf1)
10169 return NULL;
10170 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010171 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (!buf2) {
10173 if (kind1 != kind) PyMem_Free(buf1);
10174 return NULL;
10175 }
10176 len1 = PyUnicode_GET_LENGTH(self);
10177 len2 = PyUnicode_GET_LENGTH(substring);
10178
Benjamin Petersonead6b532011-12-20 17:23:42 -060010179 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010181 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10182 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010183 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010184 else
10185 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 break;
10188 case PyUnicode_2BYTE_KIND:
10189 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 break;
10192 case PyUnicode_4BYTE_KIND:
10193 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 break;
10196 default:
10197 out = NULL;
10198 }
10199 if (kind1 != kind)
10200 PyMem_Free(buf1);
10201 if (kind2 != kind)
10202 PyMem_Free(buf2);
10203 return out;
10204}
10205
10206static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10208 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010210 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10213 return asciilib_find(buf1, len1, buf2, len2, offset);
10214 else
10215 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 case PyUnicode_2BYTE_KIND:
10217 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10218 case PyUnicode_4BYTE_KIND:
10219 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10220 }
10221 assert(0);
10222 return -1;
10223}
10224
10225static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010226anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10227 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010229 switch (kind) {
10230 case PyUnicode_1BYTE_KIND:
10231 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10232 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10233 else
10234 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10235 case PyUnicode_2BYTE_KIND:
10236 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10237 case PyUnicode_4BYTE_KIND:
10238 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10239 }
10240 assert(0);
10241 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010242}
10243
Alexander Belopolsky40018472011-02-26 01:02:56 +000010244static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245replace(PyObject *self, PyObject *str1,
10246 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 PyObject *u;
10249 char *sbuf = PyUnicode_DATA(self);
10250 char *buf1 = PyUnicode_DATA(str1);
10251 char *buf2 = PyUnicode_DATA(str2);
10252 int srelease = 0, release1 = 0, release2 = 0;
10253 int skind = PyUnicode_KIND(self);
10254 int kind1 = PyUnicode_KIND(str1);
10255 int kind2 = PyUnicode_KIND(str2);
10256 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10257 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10258 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010259 int mayshrink;
10260 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
10262 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010263 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010265 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
Victor Stinner59de0ee2011-10-07 10:01:28 +020010267 if (str1 == str2)
10268 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (skind < kind1)
10270 /* substring too wide to be present */
10271 goto nothing;
10272
Victor Stinner49a0a212011-10-12 23:46:10 +020010273 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10274 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10275 /* Replacing str1 with str2 may cause a maxchar reduction in the
10276 result string. */
10277 mayshrink = (maxchar_str2 < maxchar);
10278 maxchar = Py_MAX(maxchar, maxchar_str2);
10279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010283 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010285 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 Py_UCS4 u1, u2;
10287 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010288 Py_ssize_t index, pos;
10289 char *src;
10290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010292 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10293 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010297 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010299 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010301
10302 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10303 index = 0;
10304 src = sbuf;
10305 while (--maxcount)
10306 {
10307 pos++;
10308 src += pos * PyUnicode_KIND(self);
10309 slen -= pos;
10310 index += pos;
10311 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10312 if (pos < 0)
10313 break;
10314 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10315 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010316 }
10317 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 int rkind = skind;
10319 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010320 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (kind1 < rkind) {
10323 /* widen substring */
10324 buf1 = _PyUnicode_AsKind(str1, rkind);
10325 if (!buf1) goto error;
10326 release1 = 1;
10327 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010329 if (i < 0)
10330 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 if (rkind > kind2) {
10332 /* widen replacement */
10333 buf2 = _PyUnicode_AsKind(str2, rkind);
10334 if (!buf2) goto error;
10335 release2 = 1;
10336 }
10337 else if (rkind < kind2) {
10338 /* widen self and buf1 */
10339 rkind = kind2;
10340 if (release1) PyMem_Free(buf1);
10341 sbuf = _PyUnicode_AsKind(self, rkind);
10342 if (!sbuf) goto error;
10343 srelease = 1;
10344 buf1 = _PyUnicode_AsKind(str1, rkind);
10345 if (!buf1) goto error;
10346 release1 = 1;
10347 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010348 u = PyUnicode_New(slen, maxchar);
10349 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010351 assert(PyUnicode_KIND(u) == rkind);
10352 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010353
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010355 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010356 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010358 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010360
10361 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010362 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010364 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010365 if (i == -1)
10366 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010367 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010369 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010373 }
10374 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 Py_ssize_t n, i, j, ires;
10376 Py_ssize_t product, new_size;
10377 int rkind = skind;
10378 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010381 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 buf1 = _PyUnicode_AsKind(str1, rkind);
10383 if (!buf1) goto error;
10384 release1 = 1;
10385 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010387 if (n == 0)
10388 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010390 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 buf2 = _PyUnicode_AsKind(str2, rkind);
10392 if (!buf2) goto error;
10393 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010396 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 rkind = kind2;
10398 sbuf = _PyUnicode_AsKind(self, rkind);
10399 if (!sbuf) goto error;
10400 srelease = 1;
10401 if (release1) PyMem_Free(buf1);
10402 buf1 = _PyUnicode_AsKind(str1, rkind);
10403 if (!buf1) goto error;
10404 release1 = 1;
10405 }
10406 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10407 PyUnicode_GET_LENGTH(str1))); */
10408 product = n * (len2-len1);
10409 if ((product / (len2-len1)) != n) {
10410 PyErr_SetString(PyExc_OverflowError,
10411 "replace string is too long");
10412 goto error;
10413 }
10414 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010415 if (new_size == 0) {
10416 Py_INCREF(unicode_empty);
10417 u = unicode_empty;
10418 goto done;
10419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10421 PyErr_SetString(PyExc_OverflowError,
10422 "replace string is too long");
10423 goto error;
10424 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010425 u = PyUnicode_New(new_size, maxchar);
10426 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 assert(PyUnicode_KIND(u) == rkind);
10429 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 ires = i = 0;
10431 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010432 while (n-- > 0) {
10433 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010435 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010436 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010437 if (j == -1)
10438 break;
10439 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010441 memcpy(res + rkind * ires,
10442 sbuf + rkind * i,
10443 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 }
10446 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010448 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010456 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 memcpy(res + rkind * ires,
10458 sbuf + rkind * i,
10459 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010460 }
10461 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462 /* interleave */
10463 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010464 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010466 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468 if (--n <= 0)
10469 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010470 memcpy(res + rkind * ires,
10471 sbuf + rkind * i,
10472 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 ires++;
10474 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010475 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010476 memcpy(res + rkind * ires,
10477 sbuf + rkind * i,
10478 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010480 }
10481
10482 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010483 unicode_adjust_maxchar(&u);
10484 if (u == NULL)
10485 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010487
10488 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (srelease)
10490 PyMem_FREE(sbuf);
10491 if (release1)
10492 PyMem_FREE(buf1);
10493 if (release2)
10494 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010495 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (srelease)
10501 PyMem_FREE(sbuf);
10502 if (release1)
10503 PyMem_FREE(buf1);
10504 if (release2)
10505 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010506 return unicode_result_unchanged(self);
10507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 error:
10509 if (srelease && sbuf)
10510 PyMem_FREE(sbuf);
10511 if (release1 && buf1)
10512 PyMem_FREE(buf1);
10513 if (release2 && buf2)
10514 PyMem_FREE(buf2);
10515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516}
10517
10518/* --- Unicode Object Methods --------------------------------------------- */
10519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010520PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522\n\
10523Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010524characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
10526static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010527unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010529 if (PyUnicode_READY(self) == -1)
10530 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010531 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532}
10533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010534PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536\n\
10537Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010538have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539
10540static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010541unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010543 if (PyUnicode_READY(self) == -1)
10544 return NULL;
10545 if (PyUnicode_GET_LENGTH(self) == 0)
10546 return unicode_result_unchanged(self);
10547 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548}
10549
Benjamin Petersond5890c82012-01-14 13:23:30 -050010550PyDoc_STRVAR(casefold__doc__,
10551 "S.casefold() -> str\n\
10552\n\
10553Return a version of S suitable for caseless comparisons.");
10554
10555static PyObject *
10556unicode_casefold(PyObject *self)
10557{
10558 if (PyUnicode_READY(self) == -1)
10559 return NULL;
10560 if (PyUnicode_IS_ASCII(self))
10561 return ascii_upper_or_lower(self, 1);
10562 return case_operation(self, do_casefold);
10563}
10564
10565
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010566/* Argument converter. Coerces to a single unicode character */
10567
10568static int
10569convert_uc(PyObject *obj, void *addr)
10570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010573
Benjamin Peterson14339b62009-01-31 16:36:08 +000010574 uniobj = PyUnicode_FromObject(obj);
10575 if (uniobj == NULL) {
10576 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578 return 0;
10579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583 Py_DECREF(uniobj);
10584 return 0;
10585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 Py_DECREF(uniobj);
10588 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010589}
10590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010591PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010594Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010595done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
10597static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010598unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010600 Py_ssize_t marg, left;
10601 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 Py_UCS4 fillchar = ' ';
10603
Victor Stinnere9a29352011-10-01 02:14:59 +020010604 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606
Benjamin Petersonbac79492012-01-14 13:34:47 -050010607 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 return NULL;
10609
Victor Stinnerc4b49542011-12-11 22:44:26 +010010610 if (PyUnicode_GET_LENGTH(self) >= width)
10611 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
Victor Stinnerc4b49542011-12-11 22:44:26 +010010613 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 left = marg / 2 + (marg & width & 1);
10615
Victor Stinner9310abb2011-10-05 00:59:23 +020010616 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617}
10618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619/* This function assumes that str1 and str2 are readied by the caller. */
10620
Marc-André Lemburge5034372000-08-08 08:04:29 +000010621static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010622unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 int kind1, kind2;
10625 void *data1, *data2;
10626 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 kind1 = PyUnicode_KIND(str1);
10629 kind2 = PyUnicode_KIND(str2);
10630 data1 = PyUnicode_DATA(str1);
10631 data2 = PyUnicode_DATA(str2);
10632 len1 = PyUnicode_GET_LENGTH(str1);
10633 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 for (i = 0; i < len1 && i < len2; ++i) {
10636 Py_UCS4 c1, c2;
10637 c1 = PyUnicode_READ(kind1, data1, i);
10638 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010639
10640 if (c1 != c2)
10641 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010642 }
10643
10644 return (len1 < len2) ? -1 : (len1 != len2);
10645}
10646
Alexander Belopolsky40018472011-02-26 01:02:56 +000010647int
10648PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10651 if (PyUnicode_READY(left) == -1 ||
10652 PyUnicode_READY(right) == -1)
10653 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010654 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010656 PyErr_Format(PyExc_TypeError,
10657 "Can't compare %.100s and %.100s",
10658 left->ob_type->tp_name,
10659 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 return -1;
10661}
10662
Martin v. Löwis5b222132007-06-10 09:51:05 +000010663int
10664PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 Py_ssize_t i;
10667 int kind;
10668 void *data;
10669 Py_UCS4 chr;
10670
Victor Stinner910337b2011-10-03 03:20:16 +020010671 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (PyUnicode_READY(uni) == -1)
10673 return -1;
10674 kind = PyUnicode_KIND(uni);
10675 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010676 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10678 if (chr != str[i])
10679 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010680 /* This check keeps Python strings that end in '\0' from comparing equal
10681 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010684 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010686 return 0;
10687}
10688
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010689
Benjamin Peterson29060642009-01-31 22:14:21 +000010690#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010691 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010692
Alexander Belopolsky40018472011-02-26 01:02:56 +000010693PyObject *
10694PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010695{
10696 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010697
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010698 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10699 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (PyUnicode_READY(left) == -1 ||
10701 PyUnicode_READY(right) == -1)
10702 return NULL;
10703 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10704 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010705 if (op == Py_EQ) {
10706 Py_INCREF(Py_False);
10707 return Py_False;
10708 }
10709 if (op == Py_NE) {
10710 Py_INCREF(Py_True);
10711 return Py_True;
10712 }
10713 }
10714 if (left == right)
10715 result = 0;
10716 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010717 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010718
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010719 /* Convert the return value to a Boolean */
10720 switch (op) {
10721 case Py_EQ:
10722 v = TEST_COND(result == 0);
10723 break;
10724 case Py_NE:
10725 v = TEST_COND(result != 0);
10726 break;
10727 case Py_LE:
10728 v = TEST_COND(result <= 0);
10729 break;
10730 case Py_GE:
10731 v = TEST_COND(result >= 0);
10732 break;
10733 case Py_LT:
10734 v = TEST_COND(result == -1);
10735 break;
10736 case Py_GT:
10737 v = TEST_COND(result == 1);
10738 break;
10739 default:
10740 PyErr_BadArgument();
10741 return NULL;
10742 }
10743 Py_INCREF(v);
10744 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010746
Brian Curtindfc80e32011-08-10 20:28:54 -050010747 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010748}
10749
Alexander Belopolsky40018472011-02-26 01:02:56 +000010750int
10751PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010752{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 int kind1, kind2, kind;
10755 void *buf1, *buf2;
10756 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010757 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010758
10759 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010760 sub = PyUnicode_FromObject(element);
10761 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 PyErr_Format(PyExc_TypeError,
10763 "'in <string>' requires string as left operand, not %s",
10764 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010765 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010766 }
10767
Thomas Wouters477c8d52006-05-27 19:21:47 +000010768 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010769 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010770 Py_DECREF(sub);
10771 return -1;
10772 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010773 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10774 Py_DECREF(sub);
10775 Py_DECREF(str);
10776 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 kind1 = PyUnicode_KIND(str);
10779 kind2 = PyUnicode_KIND(sub);
10780 kind = kind1 > kind2 ? kind1 : kind2;
10781 buf1 = PyUnicode_DATA(str);
10782 buf2 = PyUnicode_DATA(sub);
10783 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010784 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (!buf1) {
10786 Py_DECREF(sub);
10787 return -1;
10788 }
10789 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010790 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (!buf2) {
10792 Py_DECREF(sub);
10793 if (kind1 != kind) PyMem_Free(buf1);
10794 return -1;
10795 }
10796 len1 = PyUnicode_GET_LENGTH(str);
10797 len2 = PyUnicode_GET_LENGTH(sub);
10798
Benjamin Petersonead6b532011-12-20 17:23:42 -060010799 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 case PyUnicode_1BYTE_KIND:
10801 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10802 break;
10803 case PyUnicode_2BYTE_KIND:
10804 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10805 break;
10806 case PyUnicode_4BYTE_KIND:
10807 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10808 break;
10809 default:
10810 result = -1;
10811 assert(0);
10812 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010813
10814 Py_DECREF(str);
10815 Py_DECREF(sub);
10816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (kind1 != kind)
10818 PyMem_Free(buf1);
10819 if (kind2 != kind)
10820 PyMem_Free(buf2);
10821
Guido van Rossum403d68b2000-03-13 15:55:09 +000010822 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010823}
10824
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825/* Concat to string or Unicode object giving a new Unicode object. */
10826
Alexander Belopolsky40018472011-02-26 01:02:56 +000010827PyObject *
10828PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010831 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010832 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833
10834 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010840 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
10842 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010843 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010844 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010847 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 }
10851
Victor Stinner488fa492011-12-12 00:01:39 +010010852 u_len = PyUnicode_GET_LENGTH(u);
10853 v_len = PyUnicode_GET_LENGTH(v);
10854 if (u_len > PY_SSIZE_T_MAX - v_len) {
10855 PyErr_SetString(PyExc_OverflowError,
10856 "strings are too large to concat");
10857 goto onError;
10858 }
10859 new_len = u_len + v_len;
10860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010862 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10863 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010866 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010868 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010869 copy_characters(w, 0, u, 0, u_len);
10870 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 Py_DECREF(u);
10872 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010873 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877 Py_XDECREF(u);
10878 Py_XDECREF(v);
10879 return NULL;
10880}
10881
Walter Dörwald1ab83302007-05-18 17:15:44 +000010882void
Victor Stinner23e56682011-10-03 03:54:37 +020010883PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010884{
Victor Stinner23e56682011-10-03 03:54:37 +020010885 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010886 Py_UCS4 maxchar, maxchar2;
10887 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010888
10889 if (p_left == NULL) {
10890 if (!PyErr_Occurred())
10891 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892 return;
10893 }
Victor Stinner23e56682011-10-03 03:54:37 +020010894 left = *p_left;
10895 if (right == NULL || !PyUnicode_Check(left)) {
10896 if (!PyErr_Occurred())
10897 PyErr_BadInternalCall();
10898 goto error;
10899 }
10900
Benjamin Petersonbac79492012-01-14 13:34:47 -050010901 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010902 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010903 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010904 goto error;
10905
Victor Stinner488fa492011-12-12 00:01:39 +010010906 /* Shortcuts */
10907 if (left == unicode_empty) {
10908 Py_DECREF(left);
10909 Py_INCREF(right);
10910 *p_left = right;
10911 return;
10912 }
10913 if (right == unicode_empty)
10914 return;
10915
10916 left_len = PyUnicode_GET_LENGTH(left);
10917 right_len = PyUnicode_GET_LENGTH(right);
10918 if (left_len > PY_SSIZE_T_MAX - right_len) {
10919 PyErr_SetString(PyExc_OverflowError,
10920 "strings are too large to concat");
10921 goto error;
10922 }
10923 new_len = left_len + right_len;
10924
10925 if (unicode_modifiable(left)
10926 && PyUnicode_CheckExact(right)
10927 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010928 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10929 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010930 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010931 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010932 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10933 {
10934 /* append inplace */
10935 if (unicode_resize(p_left, new_len) != 0) {
10936 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10937 * deallocated so it cannot be put back into
10938 * 'variable'. The MemoryError is raised when there
10939 * is no value in 'variable', which might (very
10940 * remotely) be a cause of incompatibilities.
10941 */
10942 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010943 }
Victor Stinner488fa492011-12-12 00:01:39 +010010944 /* copy 'right' into the newly allocated area of 'left' */
10945 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010946 }
Victor Stinner488fa492011-12-12 00:01:39 +010010947 else {
10948 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10949 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10950 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010951
Victor Stinner488fa492011-12-12 00:01:39 +010010952 /* Concat the two Unicode strings */
10953 res = PyUnicode_New(new_len, maxchar);
10954 if (res == NULL)
10955 goto error;
10956 copy_characters(res, 0, left, 0, left_len);
10957 copy_characters(res, left_len, right, 0, right_len);
10958 Py_DECREF(left);
10959 *p_left = res;
10960 }
10961 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010962 return;
10963
10964error:
Victor Stinner488fa492011-12-12 00:01:39 +010010965 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010966}
10967
10968void
10969PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10970{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010971 PyUnicode_Append(pleft, right);
10972 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010973}
10974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010975PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010978Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010979string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010980interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981
10982static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010983unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010985 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010986 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010987 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 int kind1, kind2, kind;
10990 void *buf1, *buf2;
10991 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
Jesus Ceaac451502011-04-20 17:09:23 +020010993 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10994 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 kind1 = PyUnicode_KIND(self);
10998 kind2 = PyUnicode_KIND(substring);
10999 kind = kind1 > kind2 ? kind1 : kind2;
11000 buf1 = PyUnicode_DATA(self);
11001 buf2 = PyUnicode_DATA(substring);
11002 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011003 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (!buf1) {
11005 Py_DECREF(substring);
11006 return NULL;
11007 }
11008 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011009 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 if (!buf2) {
11011 Py_DECREF(substring);
11012 if (kind1 != kind) PyMem_Free(buf1);
11013 return NULL;
11014 }
11015 len1 = PyUnicode_GET_LENGTH(self);
11016 len2 = PyUnicode_GET_LENGTH(substring);
11017
11018 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011019 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 case PyUnicode_1BYTE_KIND:
11021 iresult = ucs1lib_count(
11022 ((Py_UCS1*)buf1) + start, end - start,
11023 buf2, len2, PY_SSIZE_T_MAX
11024 );
11025 break;
11026 case PyUnicode_2BYTE_KIND:
11027 iresult = ucs2lib_count(
11028 ((Py_UCS2*)buf1) + start, end - start,
11029 buf2, len2, PY_SSIZE_T_MAX
11030 );
11031 break;
11032 case PyUnicode_4BYTE_KIND:
11033 iresult = ucs4lib_count(
11034 ((Py_UCS4*)buf1) + start, end - start,
11035 buf2, len2, PY_SSIZE_T_MAX
11036 );
11037 break;
11038 default:
11039 assert(0); iresult = 0;
11040 }
11041
11042 result = PyLong_FromSsize_t(iresult);
11043
11044 if (kind1 != kind)
11045 PyMem_Free(buf1);
11046 if (kind2 != kind)
11047 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
11049 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011050
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 return result;
11052}
11053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011054PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011055 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011057Encode S using the codec registered for encoding. Default encoding\n\
11058is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011059handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011060a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11061'xmlcharrefreplace' as well as any other name registered with\n\
11062codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
11064static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011065unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011067 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 char *encoding = NULL;
11069 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011070
Benjamin Peterson308d6372009-09-18 21:42:35 +000011071 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11072 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011074 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011075}
11076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011077PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079\n\
11080Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011081If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
11083static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011084unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011086 Py_ssize_t i, j, line_pos, src_len, incr;
11087 Py_UCS4 ch;
11088 PyObject *u;
11089 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011091 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011092 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093
11094 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096
Antoine Pitrou22425222011-10-04 19:10:51 +020011097 if (PyUnicode_READY(self) == -1)
11098 return NULL;
11099
Thomas Wouters7e474022000-07-16 12:04:32 +000011100 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011101 src_len = PyUnicode_GET_LENGTH(self);
11102 i = j = line_pos = 0;
11103 kind = PyUnicode_KIND(self);
11104 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011105 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011106 for (; i < src_len; i++) {
11107 ch = PyUnicode_READ(kind, src_data, i);
11108 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011109 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011111 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011112 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011113 goto overflow;
11114 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011116 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011120 goto overflow;
11121 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011123 if (ch == '\n' || ch == '\r')
11124 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011126 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011127 if (!found)
11128 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011129
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011131 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 if (!u)
11133 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011134 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Antoine Pitroue71d5742011-10-04 15:55:09 +020011136 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Antoine Pitroue71d5742011-10-04 15:55:09 +020011138 for (; i < src_len; i++) {
11139 ch = PyUnicode_READ(kind, src_data, i);
11140 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011142 incr = tabsize - (line_pos % tabsize);
11143 line_pos += incr;
11144 while (incr--) {
11145 PyUnicode_WRITE(kind, dest_data, j, ' ');
11146 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011147 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011149 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011151 line_pos++;
11152 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011153 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011154 if (ch == '\n' || ch == '\r')
11155 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011157 }
11158 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011159 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011160
Antoine Pitroue71d5742011-10-04 15:55:09 +020011161 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011162 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
11169Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011170such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171arguments start and end are interpreted as in slice notation.\n\
11172\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011173Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174
11175static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011178 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011179 Py_ssize_t start;
11180 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011181 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Jesus Ceaac451502011-04-20 17:09:23 +020011183 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11184 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (PyUnicode_READY(self) == -1)
11188 return NULL;
11189 if (PyUnicode_READY(substring) == -1)
11190 return NULL;
11191
Victor Stinner7931d9a2011-11-04 00:22:48 +010011192 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193
11194 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (result == -2)
11197 return NULL;
11198
Christian Heimes217cfd12007-12-02 14:31:20 +000011199 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200}
11201
11202static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011203unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011205 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11206 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209}
11210
Guido van Rossumc2504932007-09-18 19:42:40 +000011211/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011212 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011213static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011214unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Guido van Rossumc2504932007-09-18 19:42:40 +000011216 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011217 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011218
Benjamin Peterson69e97272012-02-21 11:08:50 -050011219 assert(_Py_HashSecret_Initialized);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 if (_PyUnicode_HASH(self) != -1)
11221 return _PyUnicode_HASH(self);
11222 if (PyUnicode_READY(self) == -1)
11223 return -1;
11224 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011225 /*
11226 We make the hash of the empty string be 0, rather than using
11227 (prefix ^ suffix), since this slightly obfuscates the hash secret
11228 */
11229 if (len == 0) {
11230 _PyUnicode_HASH(self) = 0;
11231 return 0;
11232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233
11234 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011235#define HASH(P) \
11236 x ^= (Py_uhash_t) *P << 7; \
11237 while (--len >= 0) \
11238 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239
Georg Brandl2fb477c2012-02-21 00:33:36 +010011240 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 switch (PyUnicode_KIND(self)) {
11242 case PyUnicode_1BYTE_KIND: {
11243 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11244 HASH(c);
11245 break;
11246 }
11247 case PyUnicode_2BYTE_KIND: {
11248 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11249 HASH(s);
11250 break;
11251 }
11252 default: {
11253 Py_UCS4 *l;
11254 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11255 "Impossible switch case in unicode_hash");
11256 l = PyUnicode_4BYTE_DATA(self);
11257 HASH(l);
11258 break;
11259 }
11260 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011261 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11262 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263
Guido van Rossumc2504932007-09-18 19:42:40 +000011264 if (x == -1)
11265 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011267 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
11276static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011279 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011280 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011281 Py_ssize_t start;
11282 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
Jesus Ceaac451502011-04-20 17:09:23 +020011284 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11285 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (PyUnicode_READY(self) == -1)
11289 return NULL;
11290 if (PyUnicode_READY(substring) == -1)
11291 return NULL;
11292
Victor Stinner7931d9a2011-11-04 00:22:48 +010011293 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
11295 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (result == -2)
11298 return NULL;
11299
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 if (result < 0) {
11301 PyErr_SetString(PyExc_ValueError, "substring not found");
11302 return NULL;
11303 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011304
Christian Heimes217cfd12007-12-02 14:31:20 +000011305 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306}
11307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011311Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011312at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011315unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 Py_ssize_t i, length;
11318 int kind;
11319 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 int cased;
11321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (PyUnicode_READY(self) == -1)
11323 return NULL;
11324 length = PyUnicode_GET_LENGTH(self);
11325 kind = PyUnicode_KIND(self);
11326 data = PyUnicode_DATA(self);
11327
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 if (length == 1)
11330 return PyBool_FromLong(
11331 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011333 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011336
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 for (i = 0; i < length; i++) {
11339 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011340
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11342 return PyBool_FromLong(0);
11343 else if (!cased && Py_UNICODE_ISLOWER(ch))
11344 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011352Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361 int cased;
11362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (PyUnicode_READY(self) == -1)
11364 return NULL;
11365 length = PyUnicode_GET_LENGTH(self);
11366 kind = PyUnicode_KIND(self);
11367 data = PyUnicode_DATA(self);
11368
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 if (length == 1)
11371 return PyBool_FromLong(
11372 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011374 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011377
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 for (i = 0; i < length; i++) {
11380 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011381
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11383 return PyBool_FromLong(0);
11384 else if (!cased && Py_UNICODE_ISUPPER(ch))
11385 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011387 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388}
11389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011390PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011393Return True if S is a titlecased string and there is at least one\n\
11394character in S, i.e. upper- and titlecase characters may only\n\
11395follow uncased characters and lowercase characters only cased ones.\n\
11396Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397
11398static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011399unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 Py_ssize_t i, length;
11402 int kind;
11403 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 int cased, previous_is_cased;
11405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (PyUnicode_READY(self) == -1)
11407 return NULL;
11408 length = PyUnicode_GET_LENGTH(self);
11409 kind = PyUnicode_KIND(self);
11410 data = PyUnicode_DATA(self);
11411
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (length == 1) {
11414 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11415 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11416 (Py_UNICODE_ISUPPER(ch) != 0));
11417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011419 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011422
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 cased = 0;
11424 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 for (i = 0; i < length; i++) {
11426 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011427
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11429 if (previous_is_cased)
11430 return PyBool_FromLong(0);
11431 previous_is_cased = 1;
11432 cased = 1;
11433 }
11434 else if (Py_UNICODE_ISLOWER(ch)) {
11435 if (!previous_is_cased)
11436 return PyBool_FromLong(0);
11437 previous_is_cased = 1;
11438 cased = 1;
11439 }
11440 else
11441 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011443 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444}
11445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011446PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011449Return True if all characters in S are whitespace\n\
11450and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
11452static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011453unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 Py_ssize_t i, length;
11456 int kind;
11457 void *data;
11458
11459 if (PyUnicode_READY(self) == -1)
11460 return NULL;
11461 length = PyUnicode_GET_LENGTH(self);
11462 kind = PyUnicode_KIND(self);
11463 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 if (length == 1)
11467 return PyBool_FromLong(
11468 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 for (i = 0; i < length; i++) {
11475 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011476 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011479 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480}
11481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011484\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011485Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011486and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487
11488static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011489unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011490{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 Py_ssize_t i, length;
11492 int kind;
11493 void *data;
11494
11495 if (PyUnicode_READY(self) == -1)
11496 return NULL;
11497 length = PyUnicode_GET_LENGTH(self);
11498 kind = PyUnicode_KIND(self);
11499 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011500
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011501 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (length == 1)
11503 return PyBool_FromLong(
11504 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011505
11506 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 for (i = 0; i < length; i++) {
11511 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011513 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011514 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011515}
11516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011517PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011519\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011520Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011521and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011522
11523static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011524unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 int kind;
11527 void *data;
11528 Py_ssize_t len, i;
11529
11530 if (PyUnicode_READY(self) == -1)
11531 return NULL;
11532
11533 kind = PyUnicode_KIND(self);
11534 data = PyUnicode_DATA(self);
11535 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011536
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011537 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 if (len == 1) {
11539 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11540 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11541 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011542
11543 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 for (i = 0; i < len; i++) {
11548 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011549 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011551 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011552 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011553}
11554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011555PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011558Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011559False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560
11561static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011562unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 Py_ssize_t i, length;
11565 int kind;
11566 void *data;
11567
11568 if (PyUnicode_READY(self) == -1)
11569 return NULL;
11570 length = PyUnicode_GET_LENGTH(self);
11571 kind = PyUnicode_KIND(self);
11572 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (length == 1)
11576 return PyBool_FromLong(
11577 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011579 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 for (i = 0; i < length; i++) {
11584 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011587 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588}
11589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011590PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011593Return True if all characters in S are digits\n\
11594and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595
11596static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011597unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 Py_ssize_t i, length;
11600 int kind;
11601 void *data;
11602
11603 if (PyUnicode_READY(self) == -1)
11604 return NULL;
11605 length = PyUnicode_GET_LENGTH(self);
11606 kind = PyUnicode_KIND(self);
11607 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 if (length == 1) {
11611 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11612 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011615 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 for (i = 0; i < length; i++) {
11620 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011623 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624}
11625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011629Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011630False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011633unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 Py_ssize_t i, length;
11636 int kind;
11637 void *data;
11638
11639 if (PyUnicode_READY(self) == -1)
11640 return NULL;
11641 length = PyUnicode_GET_LENGTH(self);
11642 kind = PyUnicode_KIND(self);
11643 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 if (length == 1)
11647 return PyBool_FromLong(
11648 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011650 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 for (i = 0; i < length; i++) {
11655 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011658 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659}
11660
Martin v. Löwis47383402007-08-15 07:32:56 +000011661int
11662PyUnicode_IsIdentifier(PyObject *self)
11663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 int kind;
11665 void *data;
11666 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011667 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (PyUnicode_READY(self) == -1) {
11670 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 }
11673
11674 /* Special case for empty strings */
11675 if (PyUnicode_GET_LENGTH(self) == 0)
11676 return 0;
11677 kind = PyUnicode_KIND(self);
11678 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011679
11680 /* PEP 3131 says that the first character must be in
11681 XID_Start and subsequent characters in XID_Continue,
11682 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011683 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011684 letters, digits, underscore). However, given the current
11685 definition of XID_Start and XID_Continue, it is sufficient
11686 to check just for these, except that _ must be allowed
11687 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011689 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011690 return 0;
11691
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011692 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011695 return 1;
11696}
11697
11698PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011700\n\
11701Return True if S is a valid identifier according\n\
11702to the language definition.");
11703
11704static PyObject*
11705unicode_isidentifier(PyObject *self)
11706{
11707 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11708}
11709
Georg Brandl559e5d72008-06-11 18:37:52 +000011710PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011712\n\
11713Return True if all characters in S are considered\n\
11714printable in repr() or S is empty, False otherwise.");
11715
11716static PyObject*
11717unicode_isprintable(PyObject *self)
11718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 Py_ssize_t i, length;
11720 int kind;
11721 void *data;
11722
11723 if (PyUnicode_READY(self) == -1)
11724 return NULL;
11725 length = PyUnicode_GET_LENGTH(self);
11726 kind = PyUnicode_KIND(self);
11727 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011728
11729 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (length == 1)
11731 return PyBool_FromLong(
11732 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 for (i = 0; i < length; i++) {
11735 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011736 Py_RETURN_FALSE;
11737 }
11738 }
11739 Py_RETURN_TRUE;
11740}
11741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011742PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011743 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744\n\
11745Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011746iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
11748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011749unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011751 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752}
11753
Martin v. Löwis18e16552006-02-15 17:27:45 +000011754static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011755unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (PyUnicode_READY(self) == -1)
11758 return -1;
11759 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760}
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011765Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011766done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
11768static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011769unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011771 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 Py_UCS4 fillchar = ' ';
11773
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011774 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 return NULL;
11776
Benjamin Petersonbac79492012-01-14 13:34:47 -050011777 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Victor Stinnerc4b49542011-12-11 22:44:26 +010011780 if (PyUnicode_GET_LENGTH(self) >= width)
11781 return unicode_result_unchanged(self);
11782
11783 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784}
11785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011786PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011789Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
11791static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011792unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011794 if (PyUnicode_READY(self) == -1)
11795 return NULL;
11796 if (PyUnicode_IS_ASCII(self))
11797 return ascii_upper_or_lower(self, 1);
11798 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
/* Strip modes understood by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple formats, one per strip mode, indexed by the constants
   above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11809
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810/* externally visible for str.strip(unicode) */
11811PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011812_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 void *data;
11815 int kind;
11816 Py_ssize_t i, j, len;
11817 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11820 return NULL;
11821
11822 kind = PyUnicode_KIND(self);
11823 data = PyUnicode_DATA(self);
11824 len = PyUnicode_GET_LENGTH(self);
11825 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11826 PyUnicode_DATA(sepobj),
11827 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011828
Benjamin Peterson14339b62009-01-31 16:36:08 +000011829 i = 0;
11830 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 while (i < len &&
11832 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 i++;
11834 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011835 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836
Benjamin Peterson14339b62009-01-31 16:36:08 +000011837 j = len;
11838 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 do {
11840 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 } while (j >= i &&
11842 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011844 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845
Victor Stinner7931d9a2011-11-04 00:22:48 +010011846 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847}
11848
11849PyObject*
11850PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11851{
11852 unsigned char *data;
11853 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011854 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855
Victor Stinnerde636f32011-10-01 03:55:54 +020011856 if (PyUnicode_READY(self) == -1)
11857 return NULL;
11858
11859 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11860
Victor Stinner12bab6d2011-10-01 01:53:49 +020011861 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011862 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863
Victor Stinner12bab6d2011-10-01 01:53:49 +020011864 length = end - start;
11865 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011866 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867
Victor Stinnerde636f32011-10-01 03:55:54 +020011868 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011869 PyErr_SetString(PyExc_IndexError, "string index out of range");
11870 return NULL;
11871 }
11872
Victor Stinnerb9275c12011-10-05 14:01:42 +020011873 if (PyUnicode_IS_ASCII(self)) {
11874 kind = PyUnicode_KIND(self);
11875 data = PyUnicode_1BYTE_DATA(self);
11876 return unicode_fromascii(data + start, length);
11877 }
11878 else {
11879 kind = PyUnicode_KIND(self);
11880 data = PyUnicode_1BYTE_DATA(self);
11881 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011882 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011883 length);
11884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011888do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 int kind;
11891 void *data;
11892 Py_ssize_t len, i, j;
11893
11894 if (PyUnicode_READY(self) == -1)
11895 return NULL;
11896
11897 kind = PyUnicode_KIND(self);
11898 data = PyUnicode_DATA(self);
11899 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011900
Benjamin Peterson14339b62009-01-31 16:36:08 +000011901 i = 0;
11902 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011904 i++;
11905 }
11906 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011907
Benjamin Peterson14339b62009-01-31 16:36:08 +000011908 j = len;
11909 if (striptype != LEFTSTRIP) {
11910 do {
11911 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011913 j++;
11914 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011915
Victor Stinner7931d9a2011-11-04 00:22:48 +010011916 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011919
11920static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011921do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011922{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011923 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011924
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11926 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011927
Benjamin Peterson14339b62009-01-31 16:36:08 +000011928 if (sep != NULL && sep != Py_None) {
11929 if (PyUnicode_Check(sep))
11930 return _PyUnicode_XStrip(self, striptype, sep);
11931 else {
11932 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 "%s arg must be None or str",
11934 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011935 return NULL;
11936 }
11937 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011938
Benjamin Peterson14339b62009-01-31 16:36:08 +000011939 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011940}
11941
11942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011943PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011945\n\
11946Return a copy of the string S with leading and trailing\n\
11947whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011948If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011949
11950static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011951unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011953 if (PyTuple_GET_SIZE(args) == 0)
11954 return do_strip(self, BOTHSTRIP); /* Common case */
11955 else
11956 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011957}
11958
11959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011960PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011962\n\
11963Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011964If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965
11966static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011967unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011968{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011969 if (PyTuple_GET_SIZE(args) == 0)
11970 return do_strip(self, LEFTSTRIP); /* Common case */
11971 else
11972 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011973}
11974
11975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011976PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011978\n\
11979Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011980If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981
11982static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011983unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011984{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011985 if (PyTuple_GET_SIZE(args) == 0)
11986 return do_strip(self, RIGHTSTRIP); /* Common case */
11987 else
11988 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011989}
11990
11991
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011993unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011995 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
Georg Brandl222de0f2009-04-12 12:01:50 +000011998 if (len < 1) {
11999 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012000 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
Victor Stinnerc4b49542011-12-11 22:44:26 +010012003 /* no repeat, return original string */
12004 if (len == 1)
12005 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012006
Benjamin Petersonbac79492012-01-14 13:34:47 -050012007 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 return NULL;
12009
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012010 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012011 PyErr_SetString(PyExc_OverflowError,
12012 "repeated string is too long");
12013 return NULL;
12014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012016
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012017 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018 if (!u)
12019 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012020 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 if (PyUnicode_GET_LENGTH(str) == 1) {
12023 const int kind = PyUnicode_KIND(str);
12024 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012025 if (kind == PyUnicode_1BYTE_KIND) {
12026 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012027 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012028 }
12029 else if (kind == PyUnicode_2BYTE_KIND) {
12030 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012031 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012032 ucs2[n] = fill_char;
12033 } else {
12034 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12035 assert(kind == PyUnicode_4BYTE_KIND);
12036 for (n = 0; n < len; ++n)
12037 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 }
12040 else {
12041 /* number of characters copied this far */
12042 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012043 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 char *to = (char *) PyUnicode_DATA(u);
12045 Py_MEMCPY(to, PyUnicode_DATA(str),
12046 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 n = (done <= nchars-done) ? done : nchars-done;
12049 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012050 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052 }
12053
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012054 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012055 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056}
12057
Alexander Belopolsky40018472011-02-26 01:02:56 +000012058PyObject *
12059PyUnicode_Replace(PyObject *obj,
12060 PyObject *subobj,
12061 PyObject *replobj,
12062 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063{
12064 PyObject *self;
12065 PyObject *str1;
12066 PyObject *str2;
12067 PyObject *result;
12068
12069 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012070 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012073 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 Py_DECREF(self);
12075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 }
12077 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012078 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 Py_DECREF(self);
12080 Py_DECREF(str1);
12081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012083 if (PyUnicode_READY(self) == -1 ||
12084 PyUnicode_READY(str1) == -1 ||
12085 PyUnicode_READY(str2) == -1)
12086 result = NULL;
12087 else
12088 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 Py_DECREF(self);
12090 Py_DECREF(str1);
12091 Py_DECREF(str2);
12092 return result;
12093}
12094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012095PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012096 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097\n\
12098Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012099old replaced by new. If the optional argument count is\n\
12100given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101
12102static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 PyObject *str1;
12106 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012107 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 PyObject *result;
12109
Martin v. Löwis18e16552006-02-15 17:27:45 +000012110 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012112 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012115 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 return NULL;
12117 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012118 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 Py_DECREF(str1);
12120 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012121 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012122 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12123 result = NULL;
12124 else
12125 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
12127 Py_DECREF(str1);
12128 Py_DECREF(str2);
12129 return result;
12130}
12131
Alexander Belopolsky40018472011-02-26 01:02:56 +000012132static PyObject *
12133unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012135 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 Py_ssize_t isize;
12137 Py_ssize_t osize, squote, dquote, i, o;
12138 Py_UCS4 max, quote;
12139 int ikind, okind;
12140 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012143 return NULL;
12144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 isize = PyUnicode_GET_LENGTH(unicode);
12146 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 /* Compute length of output, quote characters, and
12149 maximum character */
12150 osize = 2; /* quotes */
12151 max = 127;
12152 squote = dquote = 0;
12153 ikind = PyUnicode_KIND(unicode);
12154 for (i = 0; i < isize; i++) {
12155 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12156 switch (ch) {
12157 case '\'': squote++; osize++; break;
12158 case '"': dquote++; osize++; break;
12159 case '\\': case '\t': case '\r': case '\n':
12160 osize += 2; break;
12161 default:
12162 /* Fast-path ASCII */
12163 if (ch < ' ' || ch == 0x7f)
12164 osize += 4; /* \xHH */
12165 else if (ch < 0x7f)
12166 osize++;
12167 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12168 osize++;
12169 max = ch > max ? ch : max;
12170 }
12171 else if (ch < 0x100)
12172 osize += 4; /* \xHH */
12173 else if (ch < 0x10000)
12174 osize += 6; /* \uHHHH */
12175 else
12176 osize += 10; /* \uHHHHHHHH */
12177 }
12178 }
12179
12180 quote = '\'';
12181 if (squote) {
12182 if (dquote)
12183 /* Both squote and dquote present. Use squote,
12184 and escape them */
12185 osize += squote;
12186 else
12187 quote = '"';
12188 }
12189
12190 repr = PyUnicode_New(osize, max);
12191 if (repr == NULL)
12192 return NULL;
12193 okind = PyUnicode_KIND(repr);
12194 odata = PyUnicode_DATA(repr);
12195
12196 PyUnicode_WRITE(okind, odata, 0, quote);
12197 PyUnicode_WRITE(okind, odata, osize-1, quote);
12198
12199 for (i = 0, o = 1; i < isize; i++) {
12200 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012201
12202 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 if ((ch == quote) || (ch == '\\')) {
12204 PyUnicode_WRITE(okind, odata, o++, '\\');
12205 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012206 continue;
12207 }
12208
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012210 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 PyUnicode_WRITE(okind, odata, o++, '\\');
12212 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012213 }
12214 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 PyUnicode_WRITE(okind, odata, o++, '\\');
12216 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012217 }
12218 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 PyUnicode_WRITE(okind, odata, o++, '\\');
12220 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012221 }
12222
12223 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012224 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 PyUnicode_WRITE(okind, odata, o++, '\\');
12226 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012227 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012229 }
12230
Georg Brandl559e5d72008-06-11 18:37:52 +000012231 /* Copy ASCII characters as-is */
12232 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012234 }
12235
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012237 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012238 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012239 (categories Z* and C* except ASCII space)
12240 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012242 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 if (ch <= 0xff) {
12244 PyUnicode_WRITE(okind, odata, o++, '\\');
12245 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012248 }
12249 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 else if (ch >= 0x10000) {
12251 PyUnicode_WRITE(okind, odata, o++, '\\');
12252 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012253 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12258 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12259 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12260 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012261 }
12262 /* Map 16-bit characters to '\uxxxx' */
12263 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 PyUnicode_WRITE(okind, odata, o++, '\\');
12265 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012266 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12267 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12268 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12269 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012270 }
12271 }
12272 /* Copy characters as-is */
12273 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012275 }
12276 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012279 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012280 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281}
12282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012283PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285\n\
12286Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012287such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288arguments start and end are interpreted as in slice notation.\n\
12289\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012290Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291
12292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012295 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012296 Py_ssize_t start;
12297 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299
Jesus Ceaac451502011-04-20 17:09:23 +020012300 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12301 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (PyUnicode_READY(self) == -1)
12305 return NULL;
12306 if (PyUnicode_READY(substring) == -1)
12307 return NULL;
12308
Victor Stinner7931d9a2011-11-04 00:22:48 +010012309 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310
12311 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 if (result == -2)
12314 return NULL;
12315
Christian Heimes217cfd12007-12-02 14:31:20 +000012316 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317}
12318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012319PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012322Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323
12324static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012327 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012328 Py_ssize_t start;
12329 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012330 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331
Jesus Ceaac451502011-04-20 17:09:23 +020012332 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12333 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 if (PyUnicode_READY(self) == -1)
12337 return NULL;
12338 if (PyUnicode_READY(substring) == -1)
12339 return NULL;
12340
Victor Stinner7931d9a2011-11-04 00:22:48 +010012341 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342
12343 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 if (result == -2)
12346 return NULL;
12347
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348 if (result < 0) {
12349 PyErr_SetString(PyExc_ValueError, "substring not found");
12350 return NULL;
12351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352
Christian Heimes217cfd12007-12-02 14:31:20 +000012353 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354}
12355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012356PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012359Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012360done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
12362static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012363unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012365 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 Py_UCS4 fillchar = ' ';
12367
Victor Stinnere9a29352011-10-01 02:14:59 +020012368 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012370
Benjamin Petersonbac79492012-01-14 13:34:47 -050012371 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372 return NULL;
12373
Victor Stinnerc4b49542011-12-11 22:44:26 +010012374 if (PyUnicode_GET_LENGTH(self) >= width)
12375 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376
Victor Stinnerc4b49542011-12-11 22:44:26 +010012377 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378}
12379
Alexander Belopolsky40018472011-02-26 01:02:56 +000012380PyObject *
12381PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382{
12383 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012384
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385 s = PyUnicode_FromObject(s);
12386 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 if (sep != NULL) {
12389 sep = PyUnicode_FromObject(sep);
12390 if (sep == NULL) {
12391 Py_DECREF(s);
12392 return NULL;
12393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 }
12395
Victor Stinner9310abb2011-10-05 00:59:23 +020012396 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
12398 Py_DECREF(s);
12399 Py_XDECREF(sep);
12400 return result;
12401}
12402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012403PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405\n\
12406Return a list of the words in S, using sep as the\n\
12407delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012408splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012409whitespace string is a separator and empty strings are\n\
12410removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411
12412static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012413unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414{
12415 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012416 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417
Martin v. Löwis18e16552006-02-15 17:27:45 +000012418 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419 return NULL;
12420
12421 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012424 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012426 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427}
12428
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429PyObject *
12430PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12431{
12432 PyObject* str_obj;
12433 PyObject* sep_obj;
12434 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 int kind1, kind2, kind;
12436 void *buf1 = NULL, *buf2 = NULL;
12437 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438
12439 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012440 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012442 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012443 if (!sep_obj) {
12444 Py_DECREF(str_obj);
12445 return NULL;
12446 }
12447 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12448 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449 Py_DECREF(str_obj);
12450 return NULL;
12451 }
12452
Victor Stinner14f8f022011-10-05 20:58:25 +020012453 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012455 kind = Py_MAX(kind1, kind2);
12456 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012458 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 if (!buf1)
12460 goto onError;
12461 buf2 = PyUnicode_DATA(sep_obj);
12462 if (kind2 != kind)
12463 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12464 if (!buf2)
12465 goto onError;
12466 len1 = PyUnicode_GET_LENGTH(str_obj);
12467 len2 = PyUnicode_GET_LENGTH(sep_obj);
12468
Benjamin Petersonead6b532011-12-20 17:23:42 -060012469 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012471 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12472 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12473 else
12474 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 break;
12476 case PyUnicode_2BYTE_KIND:
12477 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12478 break;
12479 case PyUnicode_4BYTE_KIND:
12480 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12481 break;
12482 default:
12483 assert(0);
12484 out = 0;
12485 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012486
12487 Py_DECREF(sep_obj);
12488 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 if (kind1 != kind)
12490 PyMem_Free(buf1);
12491 if (kind2 != kind)
12492 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012493
12494 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 onError:
12496 Py_DECREF(sep_obj);
12497 Py_DECREF(str_obj);
12498 if (kind1 != kind && buf1)
12499 PyMem_Free(buf1);
12500 if (kind2 != kind && buf2)
12501 PyMem_Free(buf2);
12502 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012503}
12504
12505
12506PyObject *
12507PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12508{
12509 PyObject* str_obj;
12510 PyObject* sep_obj;
12511 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 int kind1, kind2, kind;
12513 void *buf1 = NULL, *buf2 = NULL;
12514 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012515
12516 str_obj = PyUnicode_FromObject(str_in);
12517 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012519 sep_obj = PyUnicode_FromObject(sep_in);
12520 if (!sep_obj) {
12521 Py_DECREF(str_obj);
12522 return NULL;
12523 }
12524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 kind1 = PyUnicode_KIND(str_in);
12526 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012527 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 buf1 = PyUnicode_DATA(str_in);
12529 if (kind1 != kind)
12530 buf1 = _PyUnicode_AsKind(str_in, kind);
12531 if (!buf1)
12532 goto onError;
12533 buf2 = PyUnicode_DATA(sep_obj);
12534 if (kind2 != kind)
12535 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12536 if (!buf2)
12537 goto onError;
12538 len1 = PyUnicode_GET_LENGTH(str_obj);
12539 len2 = PyUnicode_GET_LENGTH(sep_obj);
12540
Benjamin Petersonead6b532011-12-20 17:23:42 -060012541 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012543 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12544 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12545 else
12546 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 break;
12548 case PyUnicode_2BYTE_KIND:
12549 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12550 break;
12551 case PyUnicode_4BYTE_KIND:
12552 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12553 break;
12554 default:
12555 assert(0);
12556 out = 0;
12557 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012558
12559 Py_DECREF(sep_obj);
12560 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 if (kind1 != kind)
12562 PyMem_Free(buf1);
12563 if (kind2 != kind)
12564 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012565
12566 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 onError:
12568 Py_DECREF(sep_obj);
12569 Py_DECREF(str_obj);
12570 if (kind1 != kind && buf1)
12571 PyMem_Free(buf1);
12572 if (kind2 != kind && buf2)
12573 PyMem_Free(buf2);
12574 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012575}
12576
12577PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012579\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012580Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012581the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012582found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012583
12584static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012585unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012586{
Victor Stinner9310abb2011-10-05 00:59:23 +020012587 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012588}
12589
12590PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012591 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012592\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012593Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012594the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012595separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012596
12597static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012598unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599{
Victor Stinner9310abb2011-10-05 00:59:23 +020012600 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012601}
12602
Alexander Belopolsky40018472011-02-26 01:02:56 +000012603PyObject *
12604PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012605{
12606 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012607
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012608 s = PyUnicode_FromObject(s);
12609 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012610 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 if (sep != NULL) {
12612 sep = PyUnicode_FromObject(sep);
12613 if (sep == NULL) {
12614 Py_DECREF(s);
12615 return NULL;
12616 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012617 }
12618
Victor Stinner9310abb2011-10-05 00:59:23 +020012619 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012620
12621 Py_DECREF(s);
12622 Py_XDECREF(sep);
12623 return result;
12624}
12625
12626PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012628\n\
12629Return a list of the words in S, using sep as the\n\
12630delimiter string, starting at the end of the string and\n\
12631working to the front. If maxsplit is given, at most maxsplit\n\
12632splits are done. If sep is not specified, any whitespace string\n\
12633is a separator.");
12634
12635static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012636unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012637{
12638 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012639 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012640
Martin v. Löwis18e16552006-02-15 17:27:45 +000012641 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012642 return NULL;
12643
12644 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012646 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012647 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012648 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012649 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012650}
12651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012652PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654\n\
12655Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012656Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012657is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658
12659static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012660unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012662 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012663 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012665 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12666 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 return NULL;
12668
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012669 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670}
12671
12672static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012673PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012675 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676}
12677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012678PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680\n\
12681Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012682and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683
12684static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012685unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012687 if (PyUnicode_READY(self) == -1)
12688 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012689 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
Georg Brandlceee0772007-11-27 23:48:05 +000012692PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012694\n\
12695Return a translation table usable for str.translate().\n\
12696If there is only one argument, it must be a dictionary mapping Unicode\n\
12697ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012698Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012699If there are two arguments, they must be strings of equal length, and\n\
12700in the resulting dictionary, each character in x will be mapped to the\n\
12701character at the same position in y. If there is a third argument, it\n\
12702must be a string, whose characters will be mapped to None in the result.");
12703
12704static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012705unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012706{
12707 PyObject *x, *y = NULL, *z = NULL;
12708 PyObject *new = NULL, *key, *value;
12709 Py_ssize_t i = 0;
12710 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711
Georg Brandlceee0772007-11-27 23:48:05 +000012712 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12713 return NULL;
12714 new = PyDict_New();
12715 if (!new)
12716 return NULL;
12717 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 int x_kind, y_kind, z_kind;
12719 void *x_data, *y_data, *z_data;
12720
Georg Brandlceee0772007-11-27 23:48:05 +000012721 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012722 if (!PyUnicode_Check(x)) {
12723 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12724 "be a string if there is a second argument");
12725 goto err;
12726 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012728 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12729 "arguments must have equal length");
12730 goto err;
12731 }
12732 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 x_kind = PyUnicode_KIND(x);
12734 y_kind = PyUnicode_KIND(y);
12735 x_data = PyUnicode_DATA(x);
12736 y_data = PyUnicode_DATA(y);
12737 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12738 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012739 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012740 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012741 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012742 if (!value) {
12743 Py_DECREF(key);
12744 goto err;
12745 }
Georg Brandlceee0772007-11-27 23:48:05 +000012746 res = PyDict_SetItem(new, key, value);
12747 Py_DECREF(key);
12748 Py_DECREF(value);
12749 if (res < 0)
12750 goto err;
12751 }
12752 /* create entries for deleting chars in z */
12753 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 z_kind = PyUnicode_KIND(z);
12755 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012756 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012758 if (!key)
12759 goto err;
12760 res = PyDict_SetItem(new, key, Py_None);
12761 Py_DECREF(key);
12762 if (res < 0)
12763 goto err;
12764 }
12765 }
12766 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 int kind;
12768 void *data;
12769
Georg Brandlceee0772007-11-27 23:48:05 +000012770 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012771 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012772 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12773 "to maketrans it must be a dict");
12774 goto err;
12775 }
12776 /* copy entries into the new dict, converting string keys to int keys */
12777 while (PyDict_Next(x, &i, &key, &value)) {
12778 if (PyUnicode_Check(key)) {
12779 /* convert string keys to integer keys */
12780 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012781 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012782 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12783 "table must be of length 1");
12784 goto err;
12785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 kind = PyUnicode_KIND(key);
12787 data = PyUnicode_DATA(key);
12788 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012789 if (!newkey)
12790 goto err;
12791 res = PyDict_SetItem(new, newkey, value);
12792 Py_DECREF(newkey);
12793 if (res < 0)
12794 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012795 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012796 /* just keep integer keys */
12797 if (PyDict_SetItem(new, key, value) < 0)
12798 goto err;
12799 } else {
12800 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12801 "be strings or integers");
12802 goto err;
12803 }
12804 }
12805 }
12806 return new;
12807 err:
12808 Py_DECREF(new);
12809 return NULL;
12810}
12811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012812PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814\n\
12815Return a copy of the string S, where all characters have been mapped\n\
12816through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012817Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012818Unmapped characters are left untouched. Characters mapped to None\n\
12819are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
12821static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825}
12826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012827PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012830Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831
12832static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012833unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012835 if (PyUnicode_READY(self) == -1)
12836 return NULL;
12837 if (PyUnicode_IS_ASCII(self))
12838 return ascii_upper_or_lower(self, 0);
12839 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840}
12841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012842PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012845Pad a numeric string S with zeros on the left, to fill a field\n\
12846of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847
12848static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012849unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012851 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012852 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012853 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 int kind;
12855 void *data;
12856 Py_UCS4 chr;
12857
Martin v. Löwis18e16552006-02-15 17:27:45 +000012858 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859 return NULL;
12860
Benjamin Petersonbac79492012-01-14 13:34:47 -050012861 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863
Victor Stinnerc4b49542011-12-11 22:44:26 +010012864 if (PyUnicode_GET_LENGTH(self) >= width)
12865 return unicode_result_unchanged(self);
12866
12867 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
12869 u = pad(self, fill, 0, '0');
12870
Walter Dörwald068325e2002-04-15 13:36:47 +000012871 if (u == NULL)
12872 return NULL;
12873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 kind = PyUnicode_KIND(u);
12875 data = PyUnicode_DATA(u);
12876 chr = PyUnicode_READ(kind, data, fill);
12877
12878 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 PyUnicode_WRITE(kind, data, 0, chr);
12881 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882 }
12883
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012884 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012885 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012896PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012899Return True if S starts with the specified prefix, False otherwise.\n\
12900With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012901With optional end, stop comparing S at that position.\n\
12902prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
12904static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012905unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012908 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012909 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012910 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012911 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012912 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913
Jesus Ceaac451502011-04-20 17:09:23 +020012914 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012916 if (PyTuple_Check(subobj)) {
12917 Py_ssize_t i;
12918 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012919 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012920 if (substring == NULL)
12921 return NULL;
12922 result = tailmatch(self, substring, start, end, -1);
12923 Py_DECREF(substring);
12924 if (result) {
12925 Py_RETURN_TRUE;
12926 }
12927 }
12928 /* nothing matched */
12929 Py_RETURN_FALSE;
12930 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012931 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012932 if (substring == NULL) {
12933 if (PyErr_ExceptionMatches(PyExc_TypeError))
12934 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12935 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012937 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012938 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012940 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941}
12942
12943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012944PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012945 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012947Return True if S ends with the specified suffix, False otherwise.\n\
12948With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012949With optional end, stop comparing S at that position.\n\
12950suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
12952static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012953unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012956 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012957 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012958 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012959 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012960 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961
Jesus Ceaac451502011-04-20 17:09:23 +020012962 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012964 if (PyTuple_Check(subobj)) {
12965 Py_ssize_t i;
12966 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012967 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012969 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012970 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012971 result = tailmatch(self, substring, start, end, +1);
12972 Py_DECREF(substring);
12973 if (result) {
12974 Py_RETURN_TRUE;
12975 }
12976 }
12977 Py_RETURN_FALSE;
12978 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012979 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012980 if (substring == NULL) {
12981 if (PyErr_ExceptionMatches(PyExc_TypeError))
12982 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12983 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012985 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012986 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012988 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989}
12990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012992
12993PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012995\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012996Return a formatted version of S, using substitutions from args and kwargs.\n\
12997The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012998
Eric Smith27bbca62010-11-04 17:06:58 +000012999PyDoc_STRVAR(format_map__doc__,
13000 "S.format_map(mapping) -> str\n\
13001\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013002Return a formatted version of S, using substitutions from mapping.\n\
13003The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013004
Eric Smith4a7d76d2008-05-30 18:10:19 +000013005static PyObject *
13006unicode__format__(PyObject* self, PyObject* args)
13007{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013008 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013009
13010 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13011 return NULL;
13012
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013013 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013015 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013016}
13017
Eric Smith8c663262007-08-25 02:26:07 +000013018PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013019 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013020\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013021Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013022
13023static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013024unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 Py_ssize_t size;
13027
13028 /* If it's a compact object, account for base structure +
13029 character data. */
13030 if (PyUnicode_IS_COMPACT_ASCII(v))
13031 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13032 else if (PyUnicode_IS_COMPACT(v))
13033 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013034 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 else {
13036 /* If it is a two-block object, account for base object, and
13037 for character block if present. */
13038 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013039 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013041 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 }
13043 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013044 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013045 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013047 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013048 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049
13050 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013051}
13052
13053PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013055
13056static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013057unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013058{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013059 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 if (!copy)
13061 return NULL;
13062 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013063}
13064
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065static PyMethodDef unicode_methods[] = {
13066
13067 /* Order is according to common usage: often used methods should
13068 appear first, since lookup is done sequentially. */
13069
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013070 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013071 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13072 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013073 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013074 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13075 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013076 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013077 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13078 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13079 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13080 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13081 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013082 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013083 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13084 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13085 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013086 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013087 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13088 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13089 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013090 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013091 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013092 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013093 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013094 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13095 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13096 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13097 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13098 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13099 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13100 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13101 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13102 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13103 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13104 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13105 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13106 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13107 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013108 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013109 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013110 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013111 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013112 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013113 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013114 {"maketrans", (PyCFunction) unicode_maketrans,
13115 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013116 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013117#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013118 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013119 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120#endif
13121
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123 {NULL, NULL}
13124};
13125
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013126static PyObject *
13127unicode_mod(PyObject *v, PyObject *w)
13128{
Brian Curtindfc80e32011-08-10 20:28:54 -050013129 if (!PyUnicode_Check(v))
13130 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013132}
13133
13134static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 0, /*nb_add*/
13136 0, /*nb_subtract*/
13137 0, /*nb_multiply*/
13138 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013139};
13140
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 (lenfunc) unicode_length, /* sq_length */
13143 PyUnicode_Concat, /* sq_concat */
13144 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13145 (ssizeargfunc) unicode_getitem, /* sq_item */
13146 0, /* sq_slice */
13147 0, /* sq_ass_item */
13148 0, /* sq_ass_slice */
13149 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150};
13151
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013152static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013153unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013154{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 if (PyUnicode_READY(self) == -1)
13156 return NULL;
13157
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013158 if (PyIndex_Check(item)) {
13159 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013160 if (i == -1 && PyErr_Occurred())
13161 return NULL;
13162 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013164 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013165 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013166 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013167 PyObject *result;
13168 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013169 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013170 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013174 return NULL;
13175 }
13176
13177 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013178 Py_INCREF(unicode_empty);
13179 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013181 slicelength == PyUnicode_GET_LENGTH(self)) {
13182 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013183 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013184 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013185 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013186 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013187 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013188 src_kind = PyUnicode_KIND(self);
13189 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013190 if (!PyUnicode_IS_ASCII(self)) {
13191 kind_limit = kind_maxchar_limit(src_kind);
13192 max_char = 0;
13193 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13194 ch = PyUnicode_READ(src_kind, src_data, cur);
13195 if (ch > max_char) {
13196 max_char = ch;
13197 if (max_char >= kind_limit)
13198 break;
13199 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013200 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013201 }
Victor Stinner55c99112011-10-13 01:17:06 +020013202 else
13203 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013204 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013205 if (result == NULL)
13206 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013207 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013208 dest_data = PyUnicode_DATA(result);
13209
13210 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013211 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13212 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013213 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013214 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013215 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013216 } else {
13217 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13218 return NULL;
13219 }
13220}
13221
13222static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013223 (lenfunc)unicode_length, /* mp_length */
13224 (binaryfunc)unicode_subscript, /* mp_subscript */
13225 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013226};
13227
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229/* Helpers for PyUnicode_Format() */
13230
13231static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013232getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013234 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 (*p_argidx)++;
13237 if (arglen < 0)
13238 return args;
13239 else
13240 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 }
13242 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 return NULL;
13245}
13246
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013247/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013249static PyObject *
13250formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013252 char *p;
13253 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013255
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 x = PyFloat_AsDouble(v);
13257 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013258 return NULL;
13259
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013262
Eric Smith0923d1d2009-04-16 20:16:10 +000013263 p = PyOS_double_to_string(x, type, prec,
13264 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013265 if (p == NULL)
13266 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013268 PyMem_Free(p);
13269 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270}
13271
Tim Peters38fd5b62000-09-21 05:43:11 +000013272static PyObject*
13273formatlong(PyObject *val, int flags, int prec, int type)
13274{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 char *buf;
13276 int len;
13277 PyObject *str; /* temporary string object. */
13278 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013279
Benjamin Peterson14339b62009-01-31 16:36:08 +000013280 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13281 if (!str)
13282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013283 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 Py_DECREF(str);
13285 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013286}
13287
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013288static Py_UCS4
13289formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013291 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013292 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013294 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 goto onError;
13297 }
13298 else {
13299 /* Integer input truncated to a character */
13300 long x;
13301 x = PyLong_AsLong(v);
13302 if (x == -1 && PyErr_Occurred())
13303 goto onError;
13304
Victor Stinner8faf8212011-12-08 22:14:11 +010013305 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 PyErr_SetString(PyExc_OverflowError,
13307 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013308 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 }
13310
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013311 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013312 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013313
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013315 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013316 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013317 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318}
13319
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013320static int
13321repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13322{
13323 int r;
13324 assert(count > 0);
13325 assert(PyUnicode_Check(obj));
13326 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013327 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013328 if (repeated == NULL)
13329 return -1;
13330 r = _PyAccu_Accumulate(acc, repeated);
13331 Py_DECREF(repeated);
13332 return r;
13333 }
13334 else {
13335 do {
13336 if (_PyAccu_Accumulate(acc, obj))
13337 return -1;
13338 } while (--count);
13339 return 0;
13340 }
13341}
13342
Alexander Belopolsky40018472011-02-26 01:02:56 +000013343PyObject *
13344PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 void *fmt;
13347 int fmtkind;
13348 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013350 int r;
13351 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013354 PyObject *temp = NULL;
13355 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013356 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013357 _PyAccu acc;
13358 static PyObject *plus, *minus, *blank, *zero, *percent;
13359
13360 if (!plus && !(plus = get_latin1_char('+')))
13361 return NULL;
13362 if (!minus && !(minus = get_latin1_char('-')))
13363 return NULL;
13364 if (!blank && !(blank = get_latin1_char(' ')))
13365 return NULL;
13366 if (!zero && !(zero = get_latin1_char('0')))
13367 return NULL;
13368 if (!percent && !(percent = get_latin1_char('%')))
13369 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013370
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 PyErr_BadInternalCall();
13373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013375 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013376 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013378 if (PyUnicode_READY(uformat) == -1)
13379 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013380 if (_PyAccu_Init(&acc))
13381 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013382 fmt = PyUnicode_DATA(uformat);
13383 fmtkind = PyUnicode_KIND(uformat);
13384 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13385 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013386
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 arglen = PyTuple_Size(args);
13389 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390 }
13391 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 arglen = -1;
13393 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013395 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013396 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398
13399 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013401 PyObject *nonfmt;
13402 Py_ssize_t nonfmtpos;
13403 nonfmtpos = fmtpos++;
13404 while (fmtcnt >= 0 &&
13405 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13406 fmtpos++;
13407 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013408 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013409 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 if (nonfmt == NULL)
13411 goto onError;
13412 r = _PyAccu_Accumulate(&acc, nonfmt);
13413 Py_DECREF(nonfmt);
13414 if (r)
13415 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013416 }
13417 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 /* Got a format specifier */
13419 int flags = 0;
13420 Py_ssize_t width = -1;
13421 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013422 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013423 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 int isnumok;
13425 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013426 void *pbuf = NULL;
13427 Py_ssize_t pindex, len;
13428 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 fmtpos++;
13431 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13432 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 Py_ssize_t keylen;
13434 PyObject *key;
13435 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013436
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 if (dict == NULL) {
13438 PyErr_SetString(PyExc_TypeError,
13439 "format requires a mapping");
13440 goto onError;
13441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013444 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 /* Skip over balanced parentheses */
13446 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013449 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013451 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013453 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 if (fmtcnt < 0 || pcount > 0) {
13455 PyErr_SetString(PyExc_ValueError,
13456 "incomplete format key");
13457 goto onError;
13458 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013459 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013460 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 if (key == NULL)
13462 goto onError;
13463 if (args_owned) {
13464 Py_DECREF(args);
13465 args_owned = 0;
13466 }
13467 args = PyObject_GetItem(dict, key);
13468 Py_DECREF(key);
13469 if (args == NULL) {
13470 goto onError;
13471 }
13472 args_owned = 1;
13473 arglen = -1;
13474 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013475 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013476 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 case '-': flags |= F_LJUST; continue;
13479 case '+': flags |= F_SIGN; continue;
13480 case ' ': flags |= F_BLANK; continue;
13481 case '#': flags |= F_ALT; continue;
13482 case '0': flags |= F_ZERO; continue;
13483 }
13484 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013485 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 if (c == '*') {
13487 v = getnextarg(args, arglen, &argidx);
13488 if (v == NULL)
13489 goto onError;
13490 if (!PyLong_Check(v)) {
13491 PyErr_SetString(PyExc_TypeError,
13492 "* wants int");
13493 goto onError;
13494 }
13495 width = PyLong_AsLong(v);
13496 if (width == -1 && PyErr_Occurred())
13497 goto onError;
13498 if (width < 0) {
13499 flags |= F_LJUST;
13500 width = -width;
13501 }
13502 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 }
13505 else if (c >= '0' && c <= '9') {
13506 width = c - '0';
13507 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 if (c < '0' || c > '9')
13510 break;
13511 if ((width*10) / 10 != width) {
13512 PyErr_SetString(PyExc_ValueError,
13513 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013514 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 }
13516 width = width*10 + (c - '0');
13517 }
13518 }
13519 if (c == '.') {
13520 prec = 0;
13521 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013522 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 if (c == '*') {
13524 v = getnextarg(args, arglen, &argidx);
13525 if (v == NULL)
13526 goto onError;
13527 if (!PyLong_Check(v)) {
13528 PyErr_SetString(PyExc_TypeError,
13529 "* wants int");
13530 goto onError;
13531 }
13532 prec = PyLong_AsLong(v);
13533 if (prec == -1 && PyErr_Occurred())
13534 goto onError;
13535 if (prec < 0)
13536 prec = 0;
13537 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013538 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 }
13540 else if (c >= '0' && c <= '9') {
13541 prec = c - '0';
13542 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 if (c < '0' || c > '9')
13545 break;
13546 if ((prec*10) / 10 != prec) {
13547 PyErr_SetString(PyExc_ValueError,
13548 "prec too big");
13549 goto onError;
13550 }
13551 prec = prec*10 + (c - '0');
13552 }
13553 }
13554 } /* prec */
13555 if (fmtcnt >= 0) {
13556 if (c == 'h' || c == 'l' || c == 'L') {
13557 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 }
13560 }
13561 if (fmtcnt < 0) {
13562 PyErr_SetString(PyExc_ValueError,
13563 "incomplete format");
13564 goto onError;
13565 }
13566 if (c != '%') {
13567 v = getnextarg(args, arglen, &argidx);
13568 if (v == NULL)
13569 goto onError;
13570 }
13571 sign = 0;
13572 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013573 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 switch (c) {
13575
13576 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013577 _PyAccu_Accumulate(&acc, percent);
13578 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579
13580 case 's':
13581 case 'r':
13582 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013583 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 temp = v;
13585 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 }
13587 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 if (c == 's')
13589 temp = PyObject_Str(v);
13590 else if (c == 'r')
13591 temp = PyObject_Repr(v);
13592 else
13593 temp = PyObject_ASCII(v);
13594 if (temp == NULL)
13595 goto onError;
13596 if (PyUnicode_Check(temp))
13597 /* nothing to do */;
13598 else {
13599 Py_DECREF(temp);
13600 PyErr_SetString(PyExc_TypeError,
13601 "%s argument has non-string str()");
13602 goto onError;
13603 }
13604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 if (PyUnicode_READY(temp) == -1) {
13606 Py_CLEAR(temp);
13607 goto onError;
13608 }
13609 pbuf = PyUnicode_DATA(temp);
13610 kind = PyUnicode_KIND(temp);
13611 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013612 if (prec >= 0 && len > prec)
13613 len = prec;
13614 break;
13615
13616 case 'i':
13617 case 'd':
13618 case 'u':
13619 case 'o':
13620 case 'x':
13621 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 isnumok = 0;
13623 if (PyNumber_Check(v)) {
13624 PyObject *iobj=NULL;
13625
13626 if (PyLong_Check(v)) {
13627 iobj = v;
13628 Py_INCREF(iobj);
13629 }
13630 else {
13631 iobj = PyNumber_Long(v);
13632 }
13633 if (iobj!=NULL) {
13634 if (PyLong_Check(iobj)) {
13635 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013636 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 Py_DECREF(iobj);
13638 if (!temp)
13639 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013640 if (PyUnicode_READY(temp) == -1) {
13641 Py_CLEAR(temp);
13642 goto onError;
13643 }
13644 pbuf = PyUnicode_DATA(temp);
13645 kind = PyUnicode_KIND(temp);
13646 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013647 sign = 1;
13648 }
13649 else {
13650 Py_DECREF(iobj);
13651 }
13652 }
13653 }
13654 if (!isnumok) {
13655 PyErr_Format(PyExc_TypeError,
13656 "%%%c format: a number is required, "
13657 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13658 goto onError;
13659 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013660 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013661 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013662 fillobj = zero;
13663 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 break;
13665
13666 case 'e':
13667 case 'E':
13668 case 'f':
13669 case 'F':
13670 case 'g':
13671 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013672 temp = formatfloat(v, flags, prec, c);
13673 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 if (PyUnicode_READY(temp) == -1) {
13676 Py_CLEAR(temp);
13677 goto onError;
13678 }
13679 pbuf = PyUnicode_DATA(temp);
13680 kind = PyUnicode_KIND(temp);
13681 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013683 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013685 fillobj = zero;
13686 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 break;
13688
13689 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 {
13691 Py_UCS4 ch = formatchar(v);
13692 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013693 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013694 temp = _PyUnicode_FromUCS4(&ch, 1);
13695 if (temp == NULL)
13696 goto onError;
13697 pbuf = PyUnicode_DATA(temp);
13698 kind = PyUnicode_KIND(temp);
13699 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013701 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013702
13703 default:
13704 PyErr_Format(PyExc_ValueError,
13705 "unsupported format character '%c' (0x%x) "
13706 "at index %zd",
13707 (31<=c && c<=126) ? (char)c : '?',
13708 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 goto onError;
13711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013712 /* pbuf is initialized here. */
13713 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013715 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13716 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013718 pindex++;
13719 }
13720 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13721 signobj = plus;
13722 len--;
13723 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 }
13725 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013726 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013728 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 else
13730 sign = 0;
13731 }
13732 if (width < len)
13733 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013735 if (fill != ' ') {
13736 assert(signobj != NULL);
13737 if (_PyAccu_Accumulate(&acc, signobj))
13738 goto onError;
13739 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 if (width > len)
13741 width--;
13742 }
13743 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013744 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013745 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013746 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013747 second = get_latin1_char(
13748 PyUnicode_READ(kind, pbuf, pindex + 1));
13749 pindex += 2;
13750 if (second == NULL ||
13751 _PyAccu_Accumulate(&acc, zero) ||
13752 _PyAccu_Accumulate(&acc, second))
13753 goto onError;
13754 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013756 width -= 2;
13757 if (width < 0)
13758 width = 0;
13759 len -= 2;
13760 }
13761 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013762 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013763 if (repeat_accumulate(&acc, fillobj, width - len))
13764 goto onError;
13765 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 }
13767 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013768 if (sign) {
13769 assert(signobj != NULL);
13770 if (_PyAccu_Accumulate(&acc, signobj))
13771 goto onError;
13772 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013774 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13775 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013776 second = get_latin1_char(
13777 PyUnicode_READ(kind, pbuf, pindex + 1));
13778 pindex += 2;
13779 if (second == NULL ||
13780 _PyAccu_Accumulate(&acc, zero) ||
13781 _PyAccu_Accumulate(&acc, second))
13782 goto onError;
13783 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013784 }
13785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013786 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013787 if (temp != NULL) {
13788 assert(pbuf == PyUnicode_DATA(temp));
13789 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013791 else {
13792 const char *p = (const char *) pbuf;
13793 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013794 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013795 v = PyUnicode_FromKindAndData(kind, p, len);
13796 }
13797 if (v == NULL)
13798 goto onError;
13799 r = _PyAccu_Accumulate(&acc, v);
13800 Py_DECREF(v);
13801 if (r)
13802 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013803 if (width > len && repeat_accumulate(&acc, blank, width - len))
13804 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 if (dict && (argidx < arglen) && c != '%') {
13806 PyErr_SetString(PyExc_TypeError,
13807 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 goto onError;
13809 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013810 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812 } /* until end */
13813 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 PyErr_SetString(PyExc_TypeError,
13815 "not all arguments converted during string formatting");
13816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817 }
13818
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013819 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822 }
13823 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013824 Py_XDECREF(temp);
13825 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013826 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013827
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013830 Py_XDECREF(temp);
13831 Py_XDECREF(second);
13832 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013834 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 }
13836 return NULL;
13837}
13838
/* Forward declaration: unicode_new() hands construction over to this
   function whenever the requested type is not exactly PyUnicode_Type. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13841
Tim Peters6d6c1a32001-08-02 04:15:00 +000013842static PyObject *
13843unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13844{
Benjamin Peterson29060642009-01-31 22:14:21 +000013845 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 static char *kwlist[] = {"object", "encoding", "errors", 0};
13847 char *encoding = NULL;
13848 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013849
Benjamin Peterson14339b62009-01-31 16:36:08 +000013850 if (type != &PyUnicode_Type)
13851 return unicode_subtype_new(type, args, kwds);
13852 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013853 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013854 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013855 if (x == NULL) {
13856 Py_INCREF(unicode_empty);
13857 return unicode_empty;
13858 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013859 if (encoding == NULL && errors == NULL)
13860 return PyObject_Str(x);
13861 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013862 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013863}
13864
Guido van Rossume023fe02001-08-30 03:12:59 +000013865static PyObject *
13866unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13867{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013868 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013869 Py_ssize_t length, char_size;
13870 int share_wstr, share_utf8;
13871 unsigned int kind;
13872 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013873
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013875
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013876 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013877 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013878 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013879 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013880 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013881 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013882 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013883 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013884
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013885 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013886 if (self == NULL) {
13887 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013888 return NULL;
13889 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013890 kind = PyUnicode_KIND(unicode);
13891 length = PyUnicode_GET_LENGTH(unicode);
13892
13893 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013894#ifdef Py_DEBUG
13895 _PyUnicode_HASH(self) = -1;
13896#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013897 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013898#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013899 _PyUnicode_STATE(self).interned = 0;
13900 _PyUnicode_STATE(self).kind = kind;
13901 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013902 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013903 _PyUnicode_STATE(self).ready = 1;
13904 _PyUnicode_WSTR(self) = NULL;
13905 _PyUnicode_UTF8_LENGTH(self) = 0;
13906 _PyUnicode_UTF8(self) = NULL;
13907 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013908 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013909
13910 share_utf8 = 0;
13911 share_wstr = 0;
13912 if (kind == PyUnicode_1BYTE_KIND) {
13913 char_size = 1;
13914 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13915 share_utf8 = 1;
13916 }
13917 else if (kind == PyUnicode_2BYTE_KIND) {
13918 char_size = 2;
13919 if (sizeof(wchar_t) == 2)
13920 share_wstr = 1;
13921 }
13922 else {
13923 assert(kind == PyUnicode_4BYTE_KIND);
13924 char_size = 4;
13925 if (sizeof(wchar_t) == 4)
13926 share_wstr = 1;
13927 }
13928
13929 /* Ensure we won't overflow the length. */
13930 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13931 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013932 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013933 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013934 data = PyObject_MALLOC((length + 1) * char_size);
13935 if (data == NULL) {
13936 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013937 goto onError;
13938 }
13939
Victor Stinnerc3c74152011-10-02 20:39:55 +020013940 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013941 if (share_utf8) {
13942 _PyUnicode_UTF8_LENGTH(self) = length;
13943 _PyUnicode_UTF8(self) = data;
13944 }
13945 if (share_wstr) {
13946 _PyUnicode_WSTR_LENGTH(self) = length;
13947 _PyUnicode_WSTR(self) = (wchar_t *)data;
13948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013950 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013951 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013952 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013953#ifdef Py_DEBUG
13954 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13955#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013956 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013957 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013958
13959onError:
13960 Py_DECREF(unicode);
13961 Py_DECREF(self);
13962 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013963}
13964
/* Docstring used for the tp_doc slot of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013971
/* Forward declaration: referenced by the tp_iter slot of PyUnicode_Type. */
static PyObject *unicode_iter(PyObject *seq);
13973
/* Type object for str.  Slots set to 0 are left unset here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14017
14018/* Initialize the Unicode implementation */
14019
Victor Stinner3a50e702011-10-18 21:21:00 +020014020int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014021{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014022 int i;
14023
Thomas Wouters477c8d52006-05-27 19:21:47 +000014024 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014025 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014026 0x000A, /* LINE FEED */
14027 0x000D, /* CARRIAGE RETURN */
14028 0x001C, /* FILE SEPARATOR */
14029 0x001D, /* GROUP SEPARATOR */
14030 0x001E, /* RECORD SEPARATOR */
14031 0x0085, /* NEXT LINE */
14032 0x2028, /* LINE SEPARATOR */
14033 0x2029, /* PARAGRAPH SEPARATOR */
14034 };
14035
Fred Drakee4315f52000-05-09 19:53:39 +000014036 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014037 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014038 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014039 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014040 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014042 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014043 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014044 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014045 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014046
14047 /* initialize the linebreak bloom filter */
14048 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014049 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014050 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014051
14052 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014053
14054#ifdef HAVE_MBCS
14055 winver.dwOSVersionInfoSize = sizeof(winver);
14056 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14057 PyErr_SetFromWindowsErr(0);
14058 return -1;
14059 }
14060#endif
14061 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062}
14063
14064/* Finalize the Unicode implementation */
14065
/* Free-list clearing hook for str objects.

   This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14071
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072void
Thomas Wouters78890102000-07-22 19:25:51 +000014073_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014074{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014075 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014076
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014077 Py_XDECREF(unicode_empty);
14078 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014079
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014080 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014081 if (unicode_latin1[i]) {
14082 Py_DECREF(unicode_latin1[i]);
14083 unicode_latin1[i] = NULL;
14084 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014085 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014086 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014087 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014088}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014089
/* Intern the string *p in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by that object (the reference to the old object is dropped and
   a new reference to the stored one is taken).  Otherwise *p itself is
   inserted into the dict and marked SSTATE_INTERNED_MORTAL.

   The function never reports failure: on any error the exception is
   cleared and *p is left unchanged.  Objects that are not exact str
   instances, and strings that are already interned, are left untouched. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    /* Release builds silently ignore NULL and non-str arguments. */
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one back. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The insertion runs with the thread's recursion_critical flag set;
       the flag is reset on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
14141
14142void
14143PyUnicode_InternImmortal(PyObject **p)
14144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 PyUnicode_InternInPlace(p);
14146 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014147 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014148 Py_INCREF(*p);
14149 }
Walter Dörwald16807132007-05-25 13:52:07 +000014150}
14151
14152PyObject *
14153PyUnicode_InternFromString(const char *cp)
14154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 PyObject *s = PyUnicode_FromString(cp);
14156 if (s == NULL)
14157 return NULL;
14158 PyUnicode_InternInPlace(&s);
14159 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014160}
14161
Alexander Belopolsky40018472011-02-26 01:02:56 +000014162void
14163_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014164{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014165 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014166 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 Py_ssize_t i, n;
14168 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014169
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 if (interned == NULL || !PyDict_Check(interned))
14171 return;
14172 keys = PyDict_Keys(interned);
14173 if (keys == NULL || !PyList_Check(keys)) {
14174 PyErr_Clear();
14175 return;
14176 }
Walter Dörwald16807132007-05-25 13:52:07 +000014177
Benjamin Peterson14339b62009-01-31 16:36:08 +000014178 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14179 detector, interned unicode strings are not forcibly deallocated;
14180 rather, we give them their stolen references back, and then clear
14181 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014182
Benjamin Peterson14339b62009-01-31 16:36:08 +000014183 n = PyList_GET_SIZE(keys);
14184 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014185 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014187 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014188 if (PyUnicode_READY(s) == -1) {
14189 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014190 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 case SSTATE_NOT_INTERNED:
14194 /* XXX Shouldn't happen */
14195 break;
14196 case SSTATE_INTERNED_IMMORTAL:
14197 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014198 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014199 break;
14200 case SSTATE_INTERNED_MORTAL:
14201 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014202 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014203 break;
14204 default:
14205 Py_FatalError("Inconsistent interned string state.");
14206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014207 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014208 }
14209 fprintf(stderr, "total size of all interned strings: "
14210 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14211 "mortal/immortal\n", mortal_size, immortal_size);
14212 Py_DECREF(keys);
14213 PyDict_Clear(interned);
14214 Py_DECREF(interned);
14215 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014216}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014217
14218
14219/********************* Unicode Iterator **************************/
14220
/* State of an iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;  /* Index of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14226
14227static void
14228unicodeiter_dealloc(unicodeiterobject *it)
14229{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 _PyObject_GC_UNTRACK(it);
14231 Py_XDECREF(it->it_seq);
14232 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014233}
14234
/* GC traversal for str iterators: the only object reference held is
   it_seq.  Returns 0 once the visit has completed. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14241
14242static PyObject *
14243unicodeiter_next(unicodeiterobject *it)
14244{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014245 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014246
Benjamin Peterson14339b62009-01-31 16:36:08 +000014247 assert(it != NULL);
14248 seq = it->it_seq;
14249 if (seq == NULL)
14250 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014251 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014253 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14254 int kind = PyUnicode_KIND(seq);
14255 void *data = PyUnicode_DATA(seq);
14256 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14257 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014258 if (item != NULL)
14259 ++it->it_index;
14260 return item;
14261 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014262
Benjamin Peterson14339b62009-01-31 16:36:08 +000014263 Py_DECREF(seq);
14264 it->it_seq = NULL;
14265 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014266}
14267
14268static PyObject *
14269unicodeiter_len(unicodeiterobject *it)
14270{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 Py_ssize_t len = 0;
14272 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014273 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014275}
14276
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
14284
/* Type object for the iterator created by unicode_iter().  Instances are
   GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own iterator. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
14317
14318static PyObject *
14319unicode_iter(PyObject *seq)
14320{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014322
Benjamin Peterson14339b62009-01-31 16:36:08 +000014323 if (!PyUnicode_Check(seq)) {
14324 PyErr_BadInternalCall();
14325 return NULL;
14326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014327 if (PyUnicode_READY(seq) == -1)
14328 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14330 if (it == NULL)
14331 return NULL;
14332 it->it_index = 0;
14333 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014334 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014335 _PyObject_GC_TRACK(it);
14336 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014337}
14338
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014339
14340size_t
14341Py_UNICODE_strlen(const Py_UNICODE *u)
14342{
14343 int res = 0;
14344 while(*u++)
14345 res++;
14346 return res;
14347}
14348
14349Py_UNICODE*
14350Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14351{
14352 Py_UNICODE *u = s1;
14353 while ((*u++ = *s2++));
14354 return s1;
14355}
14356
14357Py_UNICODE*
14358Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14359{
14360 Py_UNICODE *u = s1;
14361 while ((*u++ = *s2++))
14362 if (n-- == 0)
14363 break;
14364 return s1;
14365}
14366
14367Py_UNICODE*
14368Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14369{
14370 Py_UNICODE *u1 = s1;
14371 u1 += Py_UNICODE_strlen(u1);
14372 Py_UNICODE_strcpy(u1, s2);
14373 return s1;
14374}
14375
14376int
14377Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14378{
14379 while (*s1 && *s2 && *s1 == *s2)
14380 s1++, s2++;
14381 if (*s1 && *s2)
14382 return (*s1 < *s2) ? -1 : +1;
14383 if (*s1)
14384 return 1;
14385 if (*s2)
14386 return -1;
14387 return 0;
14388}
14389
14390int
14391Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14392{
14393 register Py_UNICODE u1, u2;
14394 for (; n != 0; n--) {
14395 u1 = *s1;
14396 u2 = *s2;
14397 if (u1 != u2)
14398 return (u1 < u2) ? -1 : +1;
14399 if (u1 == '\0')
14400 return 0;
14401 s1++;
14402 s2++;
14403 }
14404 return 0;
14405}
14406
14407Py_UNICODE*
14408Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14409{
14410 const Py_UNICODE *p;
14411 for (p = s; *p; p++)
14412 if (*p == c)
14413 return (Py_UNICODE*)p;
14414 return NULL;
14415}
14416
14417Py_UNICODE*
14418Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14419{
14420 const Py_UNICODE *p;
14421 p = s + Py_UNICODE_strlen(s);
14422 while (p != s) {
14423 p--;
14424 if (*p == c)
14425 return (Py_UNICODE*)p;
14426 }
14427 return NULL;
14428}
Victor Stinner331ea922010-08-10 16:37:20 +000014429
Victor Stinner71133ff2010-09-01 23:43:53 +000014430Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014431PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014432{
Victor Stinner577db2c2011-10-11 22:12:48 +020014433 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014434 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014436 if (!PyUnicode_Check(unicode)) {
14437 PyErr_BadArgument();
14438 return NULL;
14439 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014440 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014441 if (u == NULL)
14442 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014443 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014444 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014445 PyErr_NoMemory();
14446 return NULL;
14447 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014448 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014449 size *= sizeof(Py_UNICODE);
14450 copy = PyMem_Malloc(size);
14451 if (copy == NULL) {
14452 PyErr_NoMemory();
14453 return NULL;
14454 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014455 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014456 return copy;
14457}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014458
Georg Brandl66c221e2010-10-14 07:04:07 +000014459/* A _string module, to export formatter_parser and formatter_field_name_split
14460 to the string.Formatter class implemented in Python. */
14461
/* Functions exported by the _string module; each takes a single argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}  /* sentinel */
};
14469
/* Module definition for _string.  The trailing NULL entries leave the
   remaining optional slots of the definition unset. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* per-module state size */
    _string_methods,                    /* method table */
    NULL,
    NULL,
    NULL,
    NULL
};
14481
14482PyMODINIT_FUNC
14483PyInit__string(void)
14484{
14485 return PyModule_Create(&_string_module);
14486}
14487
14488
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014489#ifdef __cplusplus
14490}
14491#endif