blob: 75e9923f0d7765cea9d677364aea0a78d1d2e691 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian when the build configuration defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10FFFF
72
/* Predicate used inside the assertions of this file: a plain type check in
   regular builds, a full (content-less) consistency check when Py_DEBUG is
   defined. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.  The underscore-prefixed variants reach directly into
   the structure (most of them without any check) and are usable as
   lvalues; the unprefixed variants assert that the object is a ready
   unicode object first. */

/* Raw utf8 pointer of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer: for compact ASCII objects this is the memory located
   directly after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Raw utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: the plain length field for compact ASCII objects,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation pointer and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to length, state bit-field and hash. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a PyUnicodeObject (data.any member). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready().
   The object is asserted to pass _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200120
/* True if the utf8 pointer is the very same buffer as the character data
   (i.e. no separate UTF-8 block exists).  Must not be used on compact
   ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the very same buffer as the character data.
   The macro argument is used throughout: the expansion must not depend on
   a variable that happens to be called "unicode" at the call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
128
/* Non-zero when the object owns a separate UTF-8 memory block: it is not
   compact ASCII, its utf8 pointer is set, and that pointer is not simply
   the character data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) && \
      _PyUnicode_UTF8(op) && \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a separate wstr memory block: its wstr
   pointer is set and either the object is not ready or the pointer is not
   simply the character data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
144
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   to points to the buffer receiving the converted characters and is
   converted to "to_type *".

   Every argument is evaluated exactly once.  The "to" argument is
   parenthesized before the cast so that an expression such as "buf + off"
   is converted as a whole instead of having the cast bind to "buf" only.
   The main loop is unrolled four times; the second loop copies the
   remaining (at most three) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Walter Dörwald16807132007-05-25 13:52:07 +0000169/* This dictionary holds all interned unicode strings. Note that references
170 to strings in this dictionary are *not* counted in the string's ob_refcnt.
171 When the interned string reaches a refcnt of 0 the string deallocation
172 function will delete the reference from this dictionary.
173
174 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000175 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000176*/
177static PyObject *interned;
178
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200180static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200182/* List of static strings. */
183static _Py_Identifier *static_strings;
184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: a table
   indexed by code point (0x00..0x7F) holding 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200223static void copy_characters(
224 PyObject *to, Py_ssize_t to_start,
225 PyObject *from, Py_ssize_t from_start,
226 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100227static int unicode_modifiable(PyObject *unicode);
228
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: a table indexed by code point (0x00..0x7F) holding
   1 for characters that are line breaks.  This is lookup data only, so it
   is declared const to rule out accidental modification. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   Asserts the invariants that tie together the state bits (kind, compact,
   ascii, ready), the data/utf8/wstr pointers and the cached lengths.  When
   check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND, the
   characters are scanned as well to verify that the narrowest possible
   kind is used.

   Always returns 1 so that a call can itself be wrapped in assert(); a
   violated invariant trips one of the assert()s below. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *buf;
        /* kind whose characters have the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            buf = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cu->utf8 != buf);
        }
        else {
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            buf = full->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(buf == NULL);
                assert(cu->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(buf != NULL);
                if (base->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert(cu->utf8 == buf);
                    assert(cu->utf8_length == base->length);
                }
                else {
                    assert(cu->utf8 != buf);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                /* wstr shares the character data */
                assert(base->wstr == buf);
                assert(cu->wstr_length == base->length);
            }
            else {
                assert(base->wstr != buf);
            }
        }

        /* an absent representation must have a zero cached length */
        assert(cu->utf8 != NULL || cu->utf8_length == 0);
        assert(base->wstr != NULL || cu->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 code = PyUnicode_READ(kind, chars, pos);
            if (code > highest)
                highest = code;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500494 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100501 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100502}
503
/* Only present in builds that define HAVE_MBCS. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used: the low-order bits of
   each character, ch & (BLOOM_WIDTH - 1), select the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526#define BLOOM_MASK unsigned long
527
528static BLOOM_MASK bloom_linebreak;
529
/* BLOOM_ADD sets, in the lvalue mask, the bit selected by the low-order
   bits of ch.  BLOOM tests that bit and evaluates to non-zero when it is
   set (the character *may* be in the set) and to 0 when it is not (the
   character is definitely absent).

   The mask argument is parenthesized so that a compound expression such
   as "a | b" is used as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: a direct table lookup for characters below 128; for
   the others, the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* Membership test of chr in the string str, whose bloom mask is mask: the
   string is only searched with PyUnicode_FindChar() when the mask does not
   already rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                     \
    (BLOOM(mask, chr) &&                                                 \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100649 assert(PyUnicode_IS_COMPACT(unicode));
650
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200651 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100652 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 struct_size = sizeof(PyASCIIObject);
654 else
655 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200656 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100669 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
Benjamin Petersonbac79492012-01-14 13:34:47 -0500767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000793 Ux0000 terminated; some code (e.g. new_identifier)
794 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000795
796 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000797 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798
799*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100837 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000838 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100839 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841
Jeremy Hyltond8082792003-09-16 19:41:39 +0000842 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000843 * the caller fails before initializing str -- unicode_resize()
844 * reads str[0], and the Keep-Alive optimization can keep memory
845 * allocated for str alive across a call to unicode_dealloc(unicode).
846 * We don't want unicode_resize to read uninitialized memory in
847 * that case.
848 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849 _PyUnicode_WSTR(unicode)[0] = 0;
850 _PyUnicode_WSTR(unicode)[length] = 0;
851 _PyUnicode_WSTR_LENGTH(unicode) = length;
852 _PyUnicode_HASH(unicode) = -1;
853 _PyUnicode_STATE(unicode).interned = 0;
854 _PyUnicode_STATE(unicode).kind = 0;
855 _PyUnicode_STATE(unicode).compact = 0;
856 _PyUnicode_STATE(unicode).ready = 0;
857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200858 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200860 _PyUnicode_UTF8(unicode) = NULL;
861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100862 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 return unicode;
864}
865
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866static const char*
867unicode_kind_name(PyObject *unicode)
868{
Victor Stinner42dfd712011-10-03 14:41:45 +0200869 /* don't check consistency: unicode_kind_name() is called from
870 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 if (!PyUnicode_IS_COMPACT(unicode))
872 {
873 if (!PyUnicode_IS_READY(unicode))
874 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600875 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 {
877 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200878 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200879 return "legacy ascii";
880 else
881 return "legacy latin1";
882 case PyUnicode_2BYTE_KIND:
883 return "legacy UCS2";
884 case PyUnicode_4BYTE_KIND:
885 return "legacy UCS4";
886 default:
887 return "<legacy invalid kind>";
888 }
889 }
890 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
Benjamin Petersonbac79492012-01-14 13:34:47 -05001266 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001268 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001508#ifdef Py_DEBUG
1509static int
1510unicode_is_singleton(PyObject *unicode)
1511{
1512 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1513 if (unicode == unicode_empty)
1514 return 1;
1515 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1516 {
1517 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1518 if (ch < 256 && unicode_latin1[ch] == unicode)
1519 return 1;
1520 }
1521 return 0;
1522}
1523#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001794 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001895 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05001962 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
Benjamin Petersonbac79492012-01-14 13:34:47 -05001988 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001997 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
2105#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002106
Alexander Belopolsky40018472011-02-26 01:02:56 +00002107PyObject *
2108PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002111 if (size == 0) {
2112 Py_INCREF(unicode_empty);
2113 return unicode_empty;
2114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002115 PyErr_BadInternalCall();
2116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 }
2118
Martin v. Löwis790465f2008-04-05 20:41:37 +00002119 if (size == -1) {
2120 size = wcslen(w);
2121 }
2122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124}
2125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%'.

   f must point at the '%'.  The function reads an optional decimal width,
   an optional '.' followed by a decimal precision, and an optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z").  A length
   modifier is only recognised when it is directly followed by 'd', 'u' or
   'i'; otherwise it is left in place.

   The parsed values are stored through p_width, p_precision, p_longflag,
   p_longlongflag and p_size_tflag; any of these pointers may be NULL.

   Returns a pointer to the character where parsing stopped, normally the
   conversion character.  Two malformed inputs make it step back by one
   character instead: a precision directly followed by '%', and a
   specification cut short by the end of the string. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    ++f;

    /* width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2228
/* Number of characters reserved for the output of %ld: a 64-bit integer
   printed in decimal takes at most 20 digits, plus one character for an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Number of characters reserved for the output of %lld.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus one character for the sign; 53/22 is used as an upper bound for
   log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002474 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002491 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002493 if (PyUnicode_READY(str) == -1) {
2494 Py_DECREF(str);
2495 goto fail;
2496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002498 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 /* Remember the str and switch to the next slot */
2501 *callresult++ = str;
2502 break;
2503 }
2504 case 'R':
2505 {
2506 PyObject *obj = va_arg(count, PyObject *);
2507 PyObject *repr;
2508 assert(obj);
2509 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002510 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002512 if (PyUnicode_READY(repr) == -1) {
2513 Py_DECREF(repr);
2514 goto fail;
2515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002517 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 /* Remember the repr and switch to the next slot */
2520 *callresult++ = repr;
2521 break;
2522 }
2523 case 'A':
2524 {
2525 PyObject *obj = va_arg(count, PyObject *);
2526 PyObject *ascii;
2527 assert(obj);
2528 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002529 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002531 if (PyUnicode_READY(ascii) == -1) {
2532 Py_DECREF(ascii);
2533 goto fail;
2534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002536 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 /* Remember the repr and switch to the next slot */
2539 *callresult++ = ascii;
2540 break;
2541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 default:
2543 /* if we stumble upon an unknown
2544 formatting code, copy the rest of
2545 the format string to the output
2546 string. (we cannot just skip the
2547 code, since there's no way to know
2548 what's in the argument list) */
2549 n += strlen(p);
2550 goto expand;
2551 }
2552 } else
2553 n++;
2554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 we don't have to resize the string.
2559 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 if (!string)
2562 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 kind = PyUnicode_KIND(string);
2564 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002570 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002571
2572 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2574 /* checking for == because the last argument could be a empty
2575 string, which causes i to point to end, the assert at the end of
2576 the loop */
2577 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002578
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 switch (*f) {
2580 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002581 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const int ordinal = va_arg(vargs, int);
2583 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002586 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 case 'p':
2591 /* unused, since we already have the result */
2592 if (*f == 'p')
2593 (void) va_arg(vargs, void *);
2594 else
2595 (void) va_arg(vargs, int);
2596 /* extract the result from numberresults and append. */
2597 for (; *numberresult; ++i, ++numberresult)
2598 PyUnicode_WRITE(kind, data, i, *numberresult);
2599 /* skip over the separating '\0' */
2600 assert(*numberresult == '\0');
2601 numberresult++;
2602 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 case 's':
2605 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 size = PyUnicode_GET_LENGTH(*callresult);
2610 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002611 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002613 /* We're done with the unicode()/repr() => forget it */
2614 Py_DECREF(*callresult);
2615 /* switch to next unicode()/repr() result */
2616 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 break;
2618 }
2619 case 'U':
2620 {
2621 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 Py_ssize_t size;
2623 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2624 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 break;
2628 }
2629 case 'V':
2630 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 size = PyUnicode_GET_LENGTH(obj);
2636 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002637 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 size = PyUnicode_GET_LENGTH(*callresult);
2641 assert(PyUnicode_KIND(*callresult) <=
2642 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002643 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002645 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 break;
2649 }
2650 case 'S':
2651 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002652 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002654 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* unused, since we already have the result */
2656 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 copy_characters(string, i, *callresult, 0, size);
2659 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* We're done with the unicode()/repr() => forget it */
2661 Py_DECREF(*callresult);
2662 /* switch to next unicode()/repr() result */
2663 ++callresult;
2664 break;
2665 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 break;
2669 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 for (; *p; ++p, ++i)
2671 PyUnicode_WRITE(kind, data, i, *p);
2672 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 goto end;
2674 }
Victor Stinner1205f272010-09-11 00:54:47 +00002675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 else {
2677 assert(i < PyUnicode_GET_LENGTH(string));
2678 PyUnicode_WRITE(kind, data, i++, *f);
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002682
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 if (callresults)
2685 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002688 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 if (callresults) {
2691 PyObject **callresult2 = callresults;
2692 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002693 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 ++callresult2;
2695 }
2696 PyObject_Free(callresults);
2697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (numberresults)
2699 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701}
2702
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703PyObject *
2704PyUnicode_FromFormat(const char *format, ...)
2705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 PyObject* ret;
2707 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
2709#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 ret = PyUnicode_FromFormatV(format, vargs);
2715 va_end(vargs);
2716 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717}
2718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719#ifdef HAVE_WCHAR_H
2720
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2722 convert a Unicode object to a wide character string.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) required to convert the unicode object. Ignore size argument.
2726
Victor Stinnerd88d9832011-09-06 02:00:05 +02002727 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 wchar_t *w,
2733 Py_ssize_t size)
2734{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 const wchar_t *wstr;
2737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002738 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 if (wstr == NULL)
2740 return -1;
2741
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (size > res)
2744 size = res + 1;
2745 else
2746 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 return res;
2749 }
2750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752}
2753
2754Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002755PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 wchar_t *w,
2757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 PyErr_BadInternalCall();
2761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002763 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Victor Stinner137c34c2010-09-29 10:25:54 +00002766wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002767PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 Py_ssize_t *size)
2769{
2770 wchar_t* buffer;
2771 Py_ssize_t buflen;
2772
2773 if (unicode == NULL) {
2774 PyErr_BadInternalCall();
2775 return NULL;
2776 }
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (buflen == -1)
2780 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 PyErr_NoMemory();
2783 return NULL;
2784 }
2785
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2787 if (buffer == NULL) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (size != NULL)
2795 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002796 return buffer;
2797}
2798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
Alexander Belopolsky40018472011-02-26 01:02:56 +00002801PyObject *
2802PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002805 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_SetString(PyExc_ValueError,
2807 "chr() arg not in range(0x110000)");
2808 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (ordinal < 256)
2812 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 v = PyUnicode_New(1, ordinal);
2815 if (v == NULL)
2816 return NULL;
2817 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002818 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820}
2821
Alexander Belopolsky40018472011-02-26 01:02:56 +00002822PyObject *
2823PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002828 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002829 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 Py_INCREF(obj);
2831 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832 }
2833 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 /* For a Unicode subtype that's not a Unicode object,
2835 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002836 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002838 PyErr_Format(PyExc_TypeError,
2839 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002840 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002841 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 const char *encoding,
2847 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 PyErr_BadInternalCall();
2854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 /* Decoding bytes objects is the most common case and should be fast */
2858 if (PyBytes_Check(obj)) {
2859 if (PyBytes_GET_SIZE(obj) == 0) {
2860 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002861 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 }
2863 else {
2864 v = PyUnicode_Decode(
2865 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2866 encoding, errors);
2867 }
2868 return v;
2869 }
2870
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 PyErr_SetString(PyExc_TypeError,
2873 "decoding str is not supported");
2874 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2878 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2879 PyErr_Format(PyExc_TypeError,
2880 "coercing to str: need bytes, bytearray "
2881 "or buffer-like object, %.80s found",
2882 Py_TYPE(obj)->tp_name);
2883 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002884 }
Tim Petersced69f82003-09-16 20:30:58 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002888 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Tim Petersced69f82003-09-16 20:30:58 +00002890 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002894 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895}
2896
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` receives the NUL-terminated result and must have room for
   `lower_len` bytes.

   Return 0 on error (the result, including its terminator, does not fit
   in lower_len bytes), 1 on success.  On error the contents of `lower`
   are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without this guard, lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof includes the trailing NUL. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot; reserved for the terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 Py_ssize_t size,
2937 const char *encoding,
2938 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002939{
2940 PyObject *buffer = NULL, *unicode;
2941 Py_buffer info;
2942 char lower[11]; /* Enough for any encoding shortcut */
2943
Fred Drakee4315f52000-05-09 19:53:39 +00002944 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002945 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002946 if ((strcmp(lower, "utf-8") == 0) ||
2947 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002948 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002949 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002951 (strcmp(lower, "iso-8859-1") == 0))
2952 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002953#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002954 else if (strcmp(lower, "mbcs") == 0)
2955 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002956#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if (strcmp(lower, "ascii") == 0)
2958 return PyUnicode_DecodeASCII(s, size, errors);
2959 else if (strcmp(lower, "utf-16") == 0)
2960 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2961 else if (strcmp(lower, "utf-32") == 0)
2962 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964
2965 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002967 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002969 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 if (buffer == NULL)
2971 goto onError;
2972 unicode = PyCodec_Decode(buffer, encoding, errors);
2973 if (unicode == NULL)
2974 goto onError;
2975 if (!PyUnicode_Check(unicode)) {
2976 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002977 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002978 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(unicode);
2980 goto onError;
2981 }
2982 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002983 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 Py_XDECREF(buffer);
2987 return NULL;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
3005 /* Decode via the codec registry */
3006 v = PyCodec_Decode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003009 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
3034 if (!PyUnicode_Check(v)) {
3035 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003036 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 Py_TYPE(v)->tp_name);
3038 Py_DECREF(v);
3039 goto onError;
3040 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 Py_ssize_t size,
3050 const char *encoding,
3051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052{
3053 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 unicode = PyUnicode_FromUnicode(s, size);
3056 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3059 Py_DECREF(unicode);
3060 return v;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Encode via the codec registry */
3079 v = PyCodec_Encode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
3082 return v;
3083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert in the current locale.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Returns the index, in wchar_t
   units, of the first unit that fails to convert, or 0 if every unit
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];       /* surrogate pair + terminator */
#else
    wchar_t chunk[2];       /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;
    size_t converted;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif
    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, chunk, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3134
Victor Stinner1b579672011-12-17 05:47:23 +01003135static int
3136locale_error_handler(const char *errors, int *surrogateescape)
3137{
3138 if (errors == NULL) {
3139 *surrogateescape = 0;
3140 return 0;
3141 }
3142
3143 if (strcmp(errors, "strict") == 0) {
3144 *surrogateescape = 0;
3145 return 0;
3146 }
3147 if (strcmp(errors, "surrogateescape") == 0) {
3148 *surrogateescape = 1;
3149 return 0;
3150 }
3151 PyErr_Format(PyExc_ValueError,
3152 "only 'strict' and 'surrogateescape' error handlers "
3153 "are supported, not '%s'",
3154 errors);
3155 return -1;
3156}
3157
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003158PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003159PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160{
3161 Py_ssize_t wlen, wlen2;
3162 wchar_t *wstr;
3163 PyObject *bytes = NULL;
3164 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003165 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166 PyObject *exc;
3167 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003168 int surrogateescape;
3169
3170 if (locale_error_handler(errors, &surrogateescape) < 0)
3171 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172
3173 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3174 if (wstr == NULL)
3175 return NULL;
3176
3177 wlen2 = wcslen(wstr);
3178 if (wlen2 != wlen) {
3179 PyMem_Free(wstr);
3180 PyErr_SetString(PyExc_TypeError, "embedded null character");
3181 return NULL;
3182 }
3183
3184 if (surrogateescape) {
3185 /* locale encoding with surrogateescape */
3186 char *str;
3187
3188 str = _Py_wchar2char(wstr, &error_pos);
3189 if (str == NULL) {
3190 if (error_pos == (size_t)-1) {
3191 PyErr_NoMemory();
3192 PyMem_Free(wstr);
3193 return NULL;
3194 }
3195 else {
3196 goto encode_error;
3197 }
3198 }
3199 PyMem_Free(wstr);
3200
3201 bytes = PyBytes_FromString(str);
3202 PyMem_Free(str);
3203 }
3204 else {
3205 size_t len, len2;
3206
3207 len = wcstombs(NULL, wstr, 0);
3208 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003209 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003210 goto encode_error;
3211 }
3212
3213 bytes = PyBytes_FromStringAndSize(NULL, len);
3214 if (bytes == NULL) {
3215 PyMem_Free(wstr);
3216 return NULL;
3217 }
3218
3219 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3220 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003221 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003222 goto encode_error;
3223 }
3224 PyMem_Free(wstr);
3225 }
3226 return bytes;
3227
3228encode_error:
3229 errmsg = strerror(errno);
3230 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003231
3232 if (error_pos == (size_t)-1)
3233 error_pos = wcstombs_errorpos(wstr);
3234
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235 PyMem_Free(wstr);
3236 Py_XDECREF(bytes);
3237
Victor Stinner2f197072011-12-17 07:08:30 +01003238 if (errmsg != NULL) {
3239 size_t errlen;
3240 wstr = _Py_char2wchar(errmsg, &errlen);
3241 if (wstr != NULL) {
3242 reason = PyUnicode_FromWideChar(wstr, errlen);
3243 PyMem_Free(wstr);
3244 } else
3245 errmsg = NULL;
3246 }
3247 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003248 reason = PyUnicode_FromString(
3249 "wcstombs() encountered an unencodable "
3250 "wide character");
3251 if (reason == NULL)
3252 return NULL;
3253
3254 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3255 "locale", unicode,
3256 (Py_ssize_t)error_pos,
3257 (Py_ssize_t)(error_pos+1),
3258 reason);
3259 Py_DECREF(reason);
3260 if (exc != NULL) {
3261 PyCodec_StrictErrors(exc);
3262 Py_XDECREF(exc);
3263 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003264 return NULL;
3265}
3266
Victor Stinnerad158722010-10-27 00:25:46 +00003267PyObject *
3268PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003269{
Victor Stinner99b95382011-07-04 14:23:54 +02003270#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003271 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003272#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003274#else
Victor Stinner793b5312011-04-27 00:24:21 +02003275 PyInterpreterState *interp = PyThreadState_GET()->interp;
3276 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3277 cannot use it to encode and decode filenames before it is loaded. Load
3278 the Python codec requires to encode at least its own filename. Use the C
3279 version of the locale codec until the codec registry is initialized and
3280 the Python codec is loaded.
3281
3282 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3283 cannot only rely on it: check also interp->fscodec_initialized for
3284 subinterpreters. */
3285 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003286 return PyUnicode_AsEncodedString(unicode,
3287 Py_FileSystemDefaultEncoding,
3288 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003289 }
3290 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003291 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003292 }
Victor Stinnerad158722010-10-27 00:25:46 +00003293#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003294}
3295
Alexander Belopolsky40018472011-02-26 01:02:56 +00003296PyObject *
3297PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003302 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (!PyUnicode_Check(unicode)) {
3305 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 }
Fred Drakee4315f52000-05-09 19:53:39 +00003308
Fred Drakee4315f52000-05-09 19:53:39 +00003309 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003310 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003311 if ((strcmp(lower, "utf-8") == 0) ||
3312 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003313 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003314 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003318 }
Victor Stinner37296e82010-06-10 13:36:23 +00003319 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003320 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003321 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003323#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003324 else if (strcmp(lower, "mbcs") == 0)
3325 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003326#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003327 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Encode via the codec registry */
3332 v = PyCodec_Encode(unicode, encoding, errors);
3333 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
3335
3336 /* The normal path */
3337 if (PyBytes_Check(v))
3338 return v;
3339
3340 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003341 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003342 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344
3345 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3346 "encoder %s returned bytearray instead of bytes",
3347 encoding);
3348 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003349 Py_DECREF(v);
3350 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003352
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003353 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3354 Py_DECREF(v);
3355 return b;
3356 }
3357
3358 PyErr_Format(PyExc_TypeError,
3359 "encoder did not return a bytes object (type=%.400s)",
3360 Py_TYPE(v)->tp_name);
3361 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003362 return NULL;
3363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003369{
3370 PyObject *v;
3371
3372 if (!PyUnicode_Check(unicode)) {
3373 PyErr_BadArgument();
3374 goto onError;
3375 }
3376
3377 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003379
3380 /* Encode via the codec registry */
3381 v = PyCodec_Encode(unicode, encoding, errors);
3382 if (v == NULL)
3383 goto onError;
3384 if (!PyUnicode_Check(v)) {
3385 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003386 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387 Py_TYPE(v)->tp_name);
3388 Py_DECREF(v);
3389 goto onError;
3390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003392
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 return NULL;
3395}
3396
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert in the current locale.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence, or 0 if none is found.  Always returns 0 when the build does
   not define HAVE_MBRTOWC. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    return 0;
#endif
}
3427
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003428PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003429PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003430 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431{
3432 wchar_t smallbuf[256];
3433 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3434 wchar_t *wstr;
3435 size_t wlen, wlen2;
3436 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003437 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003438 size_t error_pos;
3439 char *errmsg;
3440 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003441
3442 if (locale_error_handler(errors, &surrogateescape) < 0)
3443 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003444
3445 if (str[len] != '\0' || len != strlen(str)) {
3446 PyErr_SetString(PyExc_TypeError, "embedded null character");
3447 return NULL;
3448 }
3449
3450 if (surrogateescape)
3451 {
3452 wstr = _Py_char2wchar(str, &wlen);
3453 if (wstr == NULL) {
3454 if (wlen == (size_t)-1)
3455 PyErr_NoMemory();
3456 else
3457 PyErr_SetFromErrno(PyExc_OSError);
3458 return NULL;
3459 }
3460
3461 unicode = PyUnicode_FromWideChar(wstr, wlen);
3462 PyMem_Free(wstr);
3463 }
3464 else {
3465#ifndef HAVE_BROKEN_MBSTOWCS
3466 wlen = mbstowcs(NULL, str, 0);
3467#else
3468 wlen = len;
3469#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003470 if (wlen == (size_t)-1)
3471 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472 if (wlen+1 <= smallbuf_len) {
3473 wstr = smallbuf;
3474 }
3475 else {
3476 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3477 return PyErr_NoMemory();
3478
3479 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3480 if (!wstr)
3481 return PyErr_NoMemory();
3482 }
3483
3484 /* This shouldn't fail now */
3485 wlen2 = mbstowcs(wstr, str, wlen+1);
3486 if (wlen2 == (size_t)-1) {
3487 if (wstr != smallbuf)
3488 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003489 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003490 }
3491#ifdef HAVE_BROKEN_MBSTOWCS
3492 assert(wlen2 == wlen);
3493#endif
3494 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3495 if (wstr != smallbuf)
3496 PyMem_Free(wstr);
3497 }
3498 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003499
3500decode_error:
3501 errmsg = strerror(errno);
3502 assert(errmsg != NULL);
3503
3504 error_pos = mbstowcs_errorpos(str, len);
3505 if (errmsg != NULL) {
3506 size_t errlen;
3507 wstr = _Py_char2wchar(errmsg, &errlen);
3508 if (wstr != NULL) {
3509 reason = PyUnicode_FromWideChar(wstr, errlen);
3510 PyMem_Free(wstr);
3511 } else
3512 errmsg = NULL;
3513 }
3514 if (errmsg == NULL)
3515 reason = PyUnicode_FromString(
3516 "mbstowcs() encountered an invalid multibyte sequence");
3517 if (reason == NULL)
3518 return NULL;
3519
3520 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3521 "locale", str, len,
3522 (Py_ssize_t)error_pos,
3523 (Py_ssize_t)(error_pos+1),
3524 reason);
3525 Py_DECREF(reason);
3526 if (exc != NULL) {
3527 PyCodec_StrictErrors(exc);
3528 Py_XDECREF(exc);
3529 }
3530 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531}
3532
3533PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003534PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003535{
3536 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003537 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538}
3539
3540
3541PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003542PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003543 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003544 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3545}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003546
Christian Heimes5894ba72007-11-04 11:43:14 +00003547PyObject*
3548PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3549{
Victor Stinner99b95382011-07-04 14:23:54 +02003550#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003551 return PyUnicode_DecodeMBCS(s, size, NULL);
3552#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003553 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003554#else
Victor Stinner793b5312011-04-27 00:24:21 +02003555 PyInterpreterState *interp = PyThreadState_GET()->interp;
3556 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3557 cannot use it to encode and decode filenames before it is loaded. Load
3558 the Python codec requires to encode at least its own filename. Use the C
3559 version of the locale codec until the codec registry is initialized and
3560 the Python codec is loaded.
3561
3562 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3563 cannot only rely on it: check also interp->fscodec_initialized for
3564 subinterpreters. */
3565 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003566 return PyUnicode_Decode(s, size,
3567 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003568 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003569 }
3570 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003571 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572 }
Victor Stinnerad158722010-10-27 00:25:46 +00003573#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574}
3575
Martin v. Löwis011e8422009-05-05 04:43:17 +00003576
3577int
Antoine Pitrou13348842012-01-29 18:36:34 +01003578_PyUnicode_HasNULChars(PyObject* s)
3579{
3580 static PyObject *nul = NULL;
3581
3582 if (nul == NULL)
3583 nul = PyUnicode_FromStringAndSize("\0", 1);
3584 if (nul == NULL)
3585 return -1;
3586 return PyUnicode_Contains(s, nul);
3587}
3588
3589
3590int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003591PyUnicode_FSConverter(PyObject* arg, void* addr)
3592{
3593 PyObject *output = NULL;
3594 Py_ssize_t size;
3595 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003596 if (arg == NULL) {
3597 Py_DECREF(*(PyObject**)addr);
3598 return 1;
3599 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003600 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003601 output = arg;
3602 Py_INCREF(output);
3603 }
3604 else {
3605 arg = PyUnicode_FromObject(arg);
3606 if (!arg)
3607 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003608 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609 Py_DECREF(arg);
3610 if (!output)
3611 return 0;
3612 if (!PyBytes_Check(output)) {
3613 Py_DECREF(output);
3614 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3615 return 0;
3616 }
3617 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003618 size = PyBytes_GET_SIZE(output);
3619 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003621 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 Py_DECREF(output);
3623 return 0;
3624 }
3625 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003626 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003627}
3628
3629
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003630int
3631PyUnicode_FSDecoder(PyObject* arg, void* addr)
3632{
3633 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003634 if (arg == NULL) {
3635 Py_DECREF(*(PyObject**)addr);
3636 return 1;
3637 }
3638 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003639 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003640 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003641 output = arg;
3642 Py_INCREF(output);
3643 }
3644 else {
3645 arg = PyBytes_FromObject(arg);
3646 if (!arg)
3647 return 0;
3648 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3649 PyBytes_GET_SIZE(arg));
3650 Py_DECREF(arg);
3651 if (!output)
3652 return 0;
3653 if (!PyUnicode_Check(output)) {
3654 Py_DECREF(output);
3655 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3656 return 0;
3657 }
3658 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003659 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003660 Py_DECREF(output);
3661 return 0;
3662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003664 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003665 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3666 Py_DECREF(output);
3667 return 0;
3668 }
3669 *(PyObject**)addr = output;
3670 return Py_CLEANUP_SUPPORTED;
3671}
3672
3673
Martin v. Löwis5b222132007-06-10 09:51:05 +00003674char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003676{
Christian Heimesf3863112007-11-22 07:46:41 +00003677 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003679 if (!PyUnicode_Check(unicode)) {
3680 PyErr_BadArgument();
3681 return NULL;
3682 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003683 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003684 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003685
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003686 if (PyUnicode_UTF8(unicode) == NULL) {
3687 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003688 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3689 if (bytes == NULL)
3690 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003691 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3692 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693 Py_DECREF(bytes);
3694 return NULL;
3695 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003696 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3697 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3698 PyBytes_AS_STRING(bytes),
3699 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003700 Py_DECREF(bytes);
3701 }
3702
3703 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003704 *psize = PyUnicode_UTF8_LENGTH(unicode);
3705 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003706}
3707
3708char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3712}
3713
3714#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003715static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716#endif
3717
3718
3719Py_UNICODE *
3720PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 const unsigned char *one_byte;
3723#if SIZEOF_WCHAR_T == 4
3724 const Py_UCS2 *two_bytes;
3725#else
3726 const Py_UCS4 *four_bytes;
3727 const Py_UCS4 *ucs4_end;
3728 Py_ssize_t num_surrogates;
3729#endif
3730 wchar_t *w;
3731 wchar_t *wchar_end;
3732
3733 if (!PyUnicode_Check(unicode)) {
3734 PyErr_BadArgument();
3735 return NULL;
3736 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003737 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003739 assert(_PyUnicode_KIND(unicode) != 0);
3740 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741
3742#ifdef Py_DEBUG
3743 ++unicode_as_unicode_calls;
3744#endif
3745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003746 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3749 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750 num_surrogates = 0;
3751
3752 for (; four_bytes < ucs4_end; ++four_bytes) {
3753 if (*four_bytes > 0xFFFF)
3754 ++num_surrogates;
3755 }
3756
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3758 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3759 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 PyErr_NoMemory();
3761 return NULL;
3762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 w = _PyUnicode_WSTR(unicode);
3766 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3767 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3769 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003770 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003772 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3773 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 }
3775 else
3776 *w = *four_bytes;
3777
3778 if (w > wchar_end) {
3779 assert(0 && "Miscalculated string end");
3780 }
3781 }
3782 *w = 0;
3783#else
3784 /* sizeof(wchar_t) == 4 */
3785 Py_FatalError("Impossible unicode object state, wstr and str "
3786 "should share memory already.");
3787 return NULL;
3788#endif
3789 }
3790 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3792 (_PyUnicode_LENGTH(unicode) + 1));
3793 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794 PyErr_NoMemory();
3795 return NULL;
3796 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003797 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3798 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3799 w = _PyUnicode_WSTR(unicode);
3800 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003802 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3803 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003804 for (; w < wchar_end; ++one_byte, ++w)
3805 *w = *one_byte;
3806 /* null-terminate the wstr */
3807 *w = 0;
3808 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003809 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 for (; w < wchar_end; ++two_bytes, ++w)
3813 *w = *two_bytes;
3814 /* null-terminate the wstr */
3815 *w = 0;
3816#else
3817 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 PyObject_FREE(_PyUnicode_WSTR(unicode));
3819 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 Py_FatalError("Impossible unicode object state, wstr "
3821 "and str should share memory already.");
3822 return NULL;
3823#endif
3824 }
3825 else {
3826 assert(0 && "This should never happen.");
3827 }
3828 }
3829 }
3830 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 *size = PyUnicode_WSTR_LENGTH(unicode);
3832 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003833}
3834
Alexander Belopolsky40018472011-02-26 01:02:56 +00003835Py_UNICODE *
3836PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839}
3840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841
Alexander Belopolsky40018472011-02-26 01:02:56 +00003842Py_ssize_t
3843PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844{
3845 if (!PyUnicode_Check(unicode)) {
3846 PyErr_BadArgument();
3847 goto onError;
3848 }
3849 return PyUnicode_GET_SIZE(unicode);
3850
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 return -1;
3853}
3854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855Py_ssize_t
3856PyUnicode_GetLength(PyObject *unicode)
3857{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003858 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 PyErr_BadArgument();
3860 return -1;
3861 }
3862
3863 return PyUnicode_GET_LENGTH(unicode);
3864}
3865
3866Py_UCS4
3867PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3868{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003869 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3870 PyErr_BadArgument();
3871 return (Py_UCS4)-1;
3872 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003873 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003874 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 return (Py_UCS4)-1;
3876 }
3877 return PyUnicode_READ_CHAR(unicode, index);
3878}
3879
3880int
3881PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3882{
3883 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003884 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 return -1;
3886 }
Victor Stinner488fa492011-12-12 00:01:39 +01003887 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003888 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003889 PyErr_SetString(PyExc_IndexError, "string index out of range");
3890 return -1;
3891 }
Victor Stinner488fa492011-12-12 00:01:39 +01003892 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003893 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3895 index, ch);
3896 return 0;
3897}
3898
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3904
Victor Stinner554f3f02010-06-16 23:33:54 +00003905/* create or adjust a UnicodeDecodeError */
3906static void
3907make_decode_exception(PyObject **exceptionObject,
3908 const char *encoding,
3909 const char *input, Py_ssize_t length,
3910 Py_ssize_t startpos, Py_ssize_t endpos,
3911 const char *reason)
3912{
3913 if (*exceptionObject == NULL) {
3914 *exceptionObject = PyUnicodeDecodeError_Create(
3915 encoding, input, length, startpos, endpos, reason);
3916 }
3917 else {
3918 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3919 goto onError;
3920 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3921 goto onError;
3922 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3923 goto onError;
3924 }
3925 return;
3926
3927onError:
3928 Py_DECREF(*exceptionObject);
3929 *exceptionObject = NULL;
3930}
3931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932/* error handling callback helper:
3933 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003934 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 and adjust various state variables.
3936 return 0 on success, -1 on error
3937*/
3938
Alexander Belopolsky40018472011-02-26 01:02:56 +00003939static int
3940unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003941 const char *encoding, const char *reason,
3942 const char **input, const char **inend, Py_ssize_t *startinpos,
3943 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003944 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003946 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947
3948 PyObject *restuple = NULL;
3949 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003950 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003951 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 Py_ssize_t requiredsize;
3953 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003954 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 int res = -1;
3956
Victor Stinner596a6c42011-11-09 00:02:18 +01003957 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3958 outsize = PyUnicode_GET_LENGTH(*output);
3959 else
3960 outsize = _PyUnicode_WSTR_LENGTH(*output);
3961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 *errorHandler = PyCodec_LookupError(errors);
3964 if (*errorHandler == NULL)
3965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 }
3967
Victor Stinner554f3f02010-06-16 23:33:54 +00003968 make_decode_exception(exceptionObject,
3969 encoding,
3970 *input, *inend - *input,
3971 *startinpos, *endinpos,
3972 reason);
3973 if (*exceptionObject == NULL)
3974 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975
3976 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3977 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003980 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 }
3983 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003985 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003986 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003987
3988 /* Copy back the bytes variables, which might have been modified by the
3989 callback */
3990 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3991 if (!inputobj)
3992 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003993 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003995 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003996 *input = PyBytes_AS_STRING(inputobj);
3997 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003998 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003999 /* we can DECREF safely, as the exception has another reference,
4000 so the object won't go away. */
4001 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004005 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4007 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009
Victor Stinner596a6c42011-11-09 00:02:18 +01004010 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4011 /* need more space? (at least enough for what we
4012 have+the replacement+the rest of the string (starting
4013 at the new input position), so we won't have to check space
4014 when there are no errors in the rest of the string) */
4015 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4016 requiredsize = *outpos + replen + insize-newpos;
4017 if (requiredsize > outsize) {
4018 if (requiredsize<2*outsize)
4019 requiredsize = 2*outsize;
4020 if (unicode_resize(output, requiredsize) < 0)
4021 goto onError;
4022 }
4023 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004024 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004025 copy_characters(*output, *outpos, repunicode, 0, replen);
4026 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004028 else {
4029 wchar_t *repwstr;
4030 Py_ssize_t repwlen;
4031 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4032 if (repwstr == NULL)
4033 goto onError;
4034 /* need more space? (at least enough for what we
4035 have+the replacement+the rest of the string (starting
4036 at the new input position), so we won't have to check space
4037 when there are no errors in the rest of the string) */
4038 requiredsize = *outpos + repwlen + insize-newpos;
4039 if (requiredsize > outsize) {
4040 if (requiredsize < 2*outsize)
4041 requiredsize = 2*outsize;
4042 if (unicode_resize(output, requiredsize) < 0)
4043 goto onError;
4044 }
4045 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4046 *outpos += repwlen;
4047 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004049 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004050
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 /* we made it! */
4052 res = 0;
4053
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 Py_XDECREF(restuple);
4056 return res;
4057}
4058
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059/* --- UTF-7 Codec -------------------------------------------------------- */
4060
Antoine Pitrou244651a2009-05-04 18:56:13 +00004061/* See RFC2152 for details. We encode conservatively and decode liberally. */
4062
/* Helper macros for the base-64 parts of UTF-7.  All of them evaluate
   their argument more than once, so pass side-effect free expressions. */

/* Does c belong to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Six-bit value of the base-64 character c.  The caller is expected to
   have checked IS_BASE64(c); anything not matched by an earlier test
   (which includes '/') yields 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low six bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: should this byte of a UTF-7 string be decoded as
 * itself?  Decoding is permissive: every ASCII byte stands for itself
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4093
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup and is therefore declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only 1..127 can ever be direct
 * directO  - non-zero to emit Set O characters as themselves
 * directWS - non-zero to emit whitespace as itself
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. "a || b") can be passed safely.  c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004139
Alexander Belopolsky40018472011-02-26 01:02:56 +00004140PyObject *
4141PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004142 Py_ssize_t size,
4143 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004144{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004145 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4146}
4147
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - input bytes
 * errors   - error handler name, passed on to
 *            unicode_decode_call_errorhandler()
 * consumed - if not NULL, receives the number of input bytes that were
 *            fully decoded; input ending inside a shift sequence is then
 *            not an error, the sequence is simply left unconsumed
 *
 * Returns a new str object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                    /* inside a '+...' base-64 section? */
    Py_ssize_t shiftOutStart;           /* outpos when the section began */
    unsigned int base64bits = 0;        /* number of pending bits */
    unsigned long base64buffer = 0;     /* the pending bits themselves */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler moved
           s back inside the input (see "goto restart" below). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the pending
                               high surrogate on its own */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Only reached through goto: report [startinpos, s) to the error
           handler, which may update starts, e, s, unicode and outpos. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the output to the number of characters actually written. */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4338
4339
Alexander Belopolsky40018472011-02-26 01:02:56 +00004340PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004341_PyUnicode_EncodeUTF7(PyObject *str,
4342 int base64SetO,
4343 int base64WhiteSpace,
4344 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004346 int kind;
4347 void *data;
4348 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004349 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004350 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004352 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 unsigned int base64bits = 0;
4354 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355 char * out;
4356 char * start;
4357
Benjamin Petersonbac79492012-01-14 13:34:47 -05004358 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004359 return NULL;
4360 kind = PyUnicode_KIND(str);
4361 data = PyUnicode_DATA(str);
4362 len = PyUnicode_GET_LENGTH(str);
4363
4364 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004367 /* It might be possible to tighten this worst case */
4368 allocated = 8 * len;
4369 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004370 return PyErr_NoMemory();
4371
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 if (v == NULL)
4374 return NULL;
4375
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004376 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004377 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004378 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (inShift) {
4381 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4382 /* shifting out */
4383 if (base64bits) { /* output remaining bits */
4384 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4385 base64buffer = 0;
4386 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
4388 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 /* Characters not in the BASE64 set implicitly unshift the sequence
4390 so no '-' is required, except if the character is itself a '-' */
4391 if (IS_BASE64(ch) || ch == '-') {
4392 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 *out++ = (char) ch;
4395 }
4396 else {
4397 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else { /* not in a shift sequence */
4401 if (ch == '+') {
4402 *out++ = '+';
4403 *out++ = '-';
4404 }
4405 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4406 *out++ = (char) ch;
4407 }
4408 else {
4409 *out++ = '+';
4410 inShift = 1;
4411 goto encode_char;
4412 }
4413 }
4414 continue;
4415encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004417 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004418
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 /* code first surrogate */
4420 base64bits += 16;
4421 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4422 while (base64bits >= 6) {
4423 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4424 base64bits -= 6;
4425 }
4426 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004427 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 base64bits += 16;
4430 base64buffer = (base64buffer << 16) | ch;
4431 while (base64bits >= 6) {
4432 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4433 base64bits -= 6;
4434 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 if (base64bits)
4437 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4438 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004440 if (_PyBytes_Resize(&v, out - start) < 0)
4441 return NULL;
4442 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004444PyObject *
4445PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4446 Py_ssize_t size,
4447 int base64SetO,
4448 int base64WhiteSpace,
4449 const char *errors)
4450{
4451 PyObject *result;
4452 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4453 if (tmp == NULL)
4454 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004455 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004456 base64WhiteSpace, errors);
4457 Py_DECREF(tmp);
4458 return result;
4459}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461#undef IS_BASE64
4462#undef FROM_BASE64
4463#undef TO_BASE64
4464#undef DECODE_DIRECT
4465#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467/* --- UTF-8 Codec -------------------------------------------------------- */
4468
/* Lookup table indexed by the value of a byte (0-255): the entry is the total
   length in bytes (1-4) of the UTF-8 sequence that the byte starts, or 0 if
   the byte cannot start a sequence. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4490
/* Decode `size` bytes of UTF-8 starting at `s`.  Forwards to
   PyUnicode_DecodeUTF8Stateful() with a NULL `consumed` pointer and returns
   its result unchanged. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
4498
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004499#include "stringlib/ucs1lib.h"
4500#include "stringlib/codecs.h"
4501#include "stringlib/undef.h"
4502
4503#include "stringlib/ucs2lib.h"
4504#include "stringlib/codecs.h"
4505#include "stringlib/undef.h"
4506
4507#include "stringlib/ucs4lib.h"
4508#include "stringlib/codecs.h"
4509#include "stringlib/undef.h"
4510
Antoine Pitrouab868312009-01-10 15:40:25 +00004511/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4512#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4513
4514/* Mask to quickly check whether a C 'long' contains a
4515 non-ASCII, UTF8-encoded char. */
4516#if (SIZEOF_LONG == 8)
4517# define ASCII_CHAR_MASK 0x8080808080808080L
4518#elif (SIZEOF_LONG == 4)
4519# define ASCII_CHAR_MASK 0x80808080L
4520#else
4521# error C 'long' size should be either 4 or 8!
4522#endif
4523
/* Scans a UTF-8 string and returns the maximum character to be expected
   and the size of the decoded unicode string.

   p            -- start of the byte string to scan
   string_size  -- number of bytes to scan
   unicode_size -- out parameter (must not be NULL): receives the number of
                   bytes that are not continuation bytes (10xxxxxx), i.e. the
                   number of sequence-starting bytes seen

   The return value is one of 127, 255, 65535 or 65537, chosen by the largest
   lead byte encountered (< 0x80, < 0xc4, < 0xf0, or anything else).

   This function doesn't check for errors, these checks are performed in
   PyUnicode_DecodeUTF8Stateful.
   */
static Py_UCS4
utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
{
    Py_ssize_t char_count = 0;
    const unsigned char *end = p + string_size;
    /* `end` rounded down to a multiple of sizeof(long): the limit for the
       word-at-a-time reads below */
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);

    assert(unicode_size != NULL);

    /* By having a cascade of independent loops which fallback onto each
       other, we minimize the amount of work done in the average loop
       iteration, and we also maximize the CPU's ability to predict
       branches correctly (because a given condition will have always the
       same boolean outcome except perhaps in the last iteration of the
       corresponding loop).
       In the general case this brings us rather close to decoding
       performance pre-PEP 393, despite the two-pass decoding.

       Note that the pure ASCII loop is not duplicated once a non-ASCII
       character has been encountered. It is actually a pessimization (by
       a significant factor) to use this loop on text with many non-ASCII
       characters, and it is important to avoid bad performance on valid
       utf-8 data (invalid utf-8 being a different can of worms).
    */

    /* ASCII */
    for (; p < end; ++p) {
        /* Only try the word-at-a-time fast path while the current byte is
           ASCII. */
        if (*p < 0x80) {
            /* Fast path: when p is aligned on a 'long' boundary, test a
               whole long at a time (the same technique is explained in
               decode_utf8_errors). */
            if (!((size_t) p & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const unsigned char *_p = p;
                while (_p < aligned_end) {
                    unsigned long value = *(unsigned long *) _p;
                    /* stop at the first word containing a byte >= 0x80 */
                    if (value & ASCII_CHAR_MASK)
                        break;
                    _p += SIZEOF_LONG;
                    char_count += SIZEOF_LONG;
                }
                p = _p;
                if (p == end)
                    break;
            }
        }
        /* p may have advanced above, so the byte is tested again */
        if (*p < 0x80)
            ++char_count;
        else
            goto _ucs1loop;     /* p is not advanced: the next loop
                                   re-examines this byte */
    }
    *unicode_size = char_count;
    return 127;

_ucs1loop:
    for (; p < end; ++p) {
        if (*p < 0xc4)
            /* count every byte that is not a continuation byte */
            char_count += ((*p & 0xc0) != 0x80);
        else
            goto _ucs2loop;
    }
    *unicode_size = char_count;
    return 255;

_ucs2loop:
    for (; p < end; ++p) {
        if (*p < 0xf0)
            char_count += ((*p & 0xc0) != 0x80);
        else
            goto _ucs4loop;
    }
    *unicode_size = char_count;
    return 65535;

_ucs4loop:
    /* nothing wider to fall back to: just count to the end */
    for (; p < end; ++p) {
        char_count += ((*p & 0xc0) != 0x80);
    }
    *unicode_size = char_count;
    return 65537;
}
4611
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: the `unicode` variable (passed by
   address to unicode_resize/unicode_putchar) and the `onError` label, which
   is jumped to when either call fails.
   `index` is evaluated exactly once (it is copied into a local first).
   Potential resizing overallocates (new length is pos + pos/8), so the result
   needs to shrink at the end.
   NOTE(review): the resize is only attempted when pos > length, so a write at
   pos == length proceeds without growing the string first -- presumably this
   relies on how unicode_putchar and the string storage treat that slot;
   confirm.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = index;                                         \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                      \
            unicode_resize(&unicode, pos + pos/8) < 0)                  \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, value) < 0)                 \
            goto onError;                                               \
    } while (0)
4625
/* UTF-8 decoder with full error handling, used by
   PyUnicode_DecodeUTF8Stateful() after its fast decoder reported errors.

   starts   -- beginning of the input buffer
   size     -- length of the input buffer in bytes
   errors   -- error handler name, forwarded to
               unicode_decode_call_errorhandler()
   consumed -- if not NULL, a truncated sequence at the end of the input
               stops decoding instead of raising an error, and *consumed
               receives the number of input bytes processed
   s        -- position in the input at which to resume decoding
   unicode  -- partially filled result string; this function takes over the
               caller's reference (it is released on the error path and
               returned on success)
   i        -- output index at which to resume writing into `unicode`

   Returns the decoded string, or NULL on error. */
static PyObject *
decode_utf8_errors(const char *starts,
                   Py_ssize_t size,
                   const char *errors,
                   Py_ssize_t *consumed,
                   const char *s,
                   PyObject *unicode,
                   Py_ssize_t i)
{
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e = starts + size;
    const char *aligned_end;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* end of input rounded down to a 'long' boundary, limit for the
       word-at-a-time ASCII fast path */
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_ssize_t _i = i;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long value = *(unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    WRITE_MAYBE_FAIL(_i+0, _s[0]);
                    WRITE_MAYBE_FAIL(_i+1, _s[1]);
                    WRITE_MAYBE_FAIL(_i+2, _s[2]);
                    WRITE_MAYBE_FAIL(_i+3, _s[3]);
#if (SIZEOF_LONG == 8)
                    WRITE_MAYBE_FAIL(_i+4, _s[4]);
                    WRITE_MAYBE_FAIL(_i+5, _s[5]);
                    WRITE_MAYBE_FAIL(_i+6, _s[6]);
                    WRITE_MAYBE_FAIL(_i+7, _s[7]);
#endif
                    _s += SIZEOF_LONG;
                    _i += SIZEOF_LONG;
                }
                s = _s;
                i = _i;
                if (s == e)
                    break;
                /* s may have moved: reload the current byte */
                ch = (unsigned char)*s;
            }
        }

        if (ch < 0x80) {
            WRITE_MAYBE_FAIL(i++, ch);
            s++;
            continue;
        }

        /* sequence length announced by the lead byte (0 = invalid lead) */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* the sequence would run past the end of the input */
            if (consumed)
                break;      /* stateful mode: leave the tail unconsumed */
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the error range over the continuation bytes that
                   are actually present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* bytes < 0x80 were handled above, so a length of 1 is not
               expected here */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;

        case 4:
            /* besides the continuation-byte checks, F0 followed by a byte
               < 0x90 and F4 followed by a byte > 0x8F are rejected */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* extend the error range over the leading continuation
                   bytes that are well-formed */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

            WRITE_MAYBE_FAIL(i++, ch);
            break;
        }
        s += n;
        continue;

      utf8Error:
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &i))
            goto onError;
        /* The error handler call receives &starts, &e and &s, so the input
           window may have changed: recompute the aligned end from e. */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length and ready string when it contained errors and
       is of the old resizable kind. */
    if (unicode_resize(&unicode, i) < 0)
        goto onError;
    unicode_adjust_maxchar(&unicode);
    if (unicode == NULL)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(unicode);
    return NULL;
}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004827#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004828
Victor Stinner785938e2011-12-11 20:09:03 +01004829PyObject *
4830PyUnicode_DecodeUTF8Stateful(const char *s,
4831 Py_ssize_t size,
4832 const char *errors,
4833 Py_ssize_t *consumed)
4834{
4835 Py_UCS4 maxchar = 0;
4836 Py_ssize_t unicode_size;
4837 int has_errors = 0;
4838 PyObject *unicode;
4839 int kind;
4840 void *data;
4841 const char *starts = s;
4842 const char *e;
4843 Py_ssize_t i;
4844
4845 if (size == 0) {
4846 if (consumed)
4847 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004848 Py_INCREF(unicode_empty);
4849 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004850 }
4851
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004852 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004853
4854 /* When the string is ASCII only, just use memcpy and return.
4855 unicode_size may be != size if there is an incomplete UTF-8
4856 sequence at the end of the ASCII block. */
4857 if (maxchar < 128 && size == unicode_size) {
4858 if (consumed)
4859 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004860 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004861 }
4862
4863 unicode = PyUnicode_New(unicode_size, maxchar);
4864 if (!unicode)
4865 return NULL;
4866 kind = PyUnicode_KIND(unicode);
4867 data = PyUnicode_DATA(unicode);
4868
4869 /* Unpack UTF-8 encoded data */
4870 i = 0;
4871 e = starts + size;
4872 switch (kind) {
4873 case PyUnicode_1BYTE_KIND:
4874 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4875 break;
4876 case PyUnicode_2BYTE_KIND:
4877 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4878 break;
4879 case PyUnicode_4BYTE_KIND:
4880 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4881 break;
4882 }
4883 if (!has_errors) {
4884 /* Ensure the unicode size calculation was correct */
4885 assert(i == unicode_size);
4886 assert(s == e);
4887 if (consumed)
4888 *consumed = size;
4889 return unicode;
4890 }
4891
4892 /* In case of errors, maxchar and size computation might be incorrect;
4893 code below refits and resizes as necessary. */
4894 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4895}
4896
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004897#ifdef __APPLE__
4898
/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a newly PyMem_Malloc'ed, NUL-terminated
   wchar_t array.  Every byte that is not part of a valid sequence is
   written as the single code unit 0xDC00 + byte.
   Returns NULL if the size computation would overflow (after calling
   PyErr_NoMemory()) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* one wchar_t per input byte, plus one for the terminator */
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch keeps the lead byte until a sequence has been fully validated;
           the surrogateescape label below relies on that */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* truncated sequence at the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* escape only the offending byte and resume at the next one */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}
5002
5003#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005005/* Primary internal function which creates utf8 encoded bytes objects.
5006
5007 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005008 and allocate exactly as much space needed at the end. Else allocate the
5009 maximum possible needed (4 result bytes per Unicode character), and return
5010 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005011*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005012PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005013_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014{
Victor Stinner6099a032011-12-18 14:22:26 +01005015 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005016 void *data;
5017 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005019 if (!PyUnicode_Check(unicode)) {
5020 PyErr_BadArgument();
5021 return NULL;
5022 }
5023
5024 if (PyUnicode_READY(unicode) == -1)
5025 return NULL;
5026
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005027 if (PyUnicode_UTF8(unicode))
5028 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5029 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030
5031 kind = PyUnicode_KIND(unicode);
5032 data = PyUnicode_DATA(unicode);
5033 size = PyUnicode_GET_LENGTH(unicode);
5034
Benjamin Petersonead6b532011-12-20 17:23:42 -06005035 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005036 default:
5037 assert(0);
5038 case PyUnicode_1BYTE_KIND:
5039 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5040 assert(!PyUnicode_IS_ASCII(unicode));
5041 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5042 case PyUnicode_2BYTE_KIND:
5043 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5044 case PyUnicode_4BYTE_KIND:
5045 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047}
5048
Alexander Belopolsky40018472011-02-26 01:02:56 +00005049PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5051 Py_ssize_t size,
5052 const char *errors)
5053{
5054 PyObject *v, *unicode;
5055
5056 unicode = PyUnicode_FromUnicode(s, size);
5057 if (unicode == NULL)
5058 return NULL;
5059 v = _PyUnicode_AsUTF8String(unicode, errors);
5060 Py_DECREF(unicode);
5061 return v;
5062}
5063
5064PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005065PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005067 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068}
5069
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070/* --- UTF-32 Codec ------------------------------------------------------- */
5071
5072PyObject *
5073PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 Py_ssize_t size,
5075 const char *errors,
5076 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077{
5078 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5079}
5080
5081PyObject *
5082PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 Py_ssize_t size,
5084 const char *errors,
5085 int *byteorder,
5086 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087{
5088 const char *starts = s;
5089 Py_ssize_t startinpos;
5090 Py_ssize_t endinpos;
5091 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005092 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005093 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 int bo = 0; /* assume native ordering by default */
5095 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 /* Offsets from q for retrieving bytes in the right order. */
5097#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5098 int iorder[] = {0, 1, 2, 3};
5099#else
5100 int iorder[] = {3, 2, 1, 0};
5101#endif
5102 PyObject *errorHandler = NULL;
5103 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005104
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 q = (unsigned char *)s;
5106 e = q + size;
5107
5108 if (byteorder)
5109 bo = *byteorder;
5110
5111 /* Check for BOM marks (U+FEFF) in the input and adjust current
5112 byte order setting accordingly. In native mode, the leading BOM
5113 mark is skipped, in all other modes, it is copied to the output
5114 stream as-is (giving a ZWNBSP character). */
5115 if (bo == 0) {
5116 if (size >= 4) {
5117 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 if (bom == 0x0000FEFF) {
5121 q += 4;
5122 bo = -1;
5123 }
5124 else if (bom == 0xFFFE0000) {
5125 q += 4;
5126 bo = 1;
5127 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 if (bom == 0x0000FEFF) {
5130 q += 4;
5131 bo = 1;
5132 }
5133 else if (bom == 0xFFFE0000) {
5134 q += 4;
5135 bo = -1;
5136 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139 }
5140
5141 if (bo == -1) {
5142 /* force LE */
5143 iorder[0] = 0;
5144 iorder[1] = 1;
5145 iorder[2] = 2;
5146 iorder[3] = 3;
5147 }
5148 else if (bo == 1) {
5149 /* force BE */
5150 iorder[0] = 3;
5151 iorder[1] = 2;
5152 iorder[2] = 1;
5153 iorder[3] = 0;
5154 }
5155
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005156 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005157 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005158 if (!unicode)
5159 return NULL;
5160 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005162 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005163
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_UCS4 ch;
5166 /* remaining bytes at the end? (size should be divisible by 4) */
5167 if (e-q<4) {
5168 if (consumed)
5169 break;
5170 errmsg = "truncated data";
5171 startinpos = ((const char *)q)-starts;
5172 endinpos = ((const char *)e)-starts;
5173 goto utf32Error;
5174 /* The remaining input chars are ignored if the callback
5175 chooses to skip the input */
5176 }
5177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5178 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 if (ch >= 0x110000)
5181 {
5182 errmsg = "codepoint not in range(0x110000)";
5183 startinpos = ((const char *)q)-starts;
5184 endinpos = startinpos+4;
5185 goto utf32Error;
5186 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005187 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5188 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 q += 4;
5190 continue;
5191 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 if (unicode_decode_call_errorhandler(
5193 errors, &errorHandler,
5194 "utf32", errmsg,
5195 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005196 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 }
5199
5200 if (byteorder)
5201 *byteorder = bo;
5202
5203 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005205
5206 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005207 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005208 goto onError;
5209
5210 Py_XDECREF(errorHandler);
5211 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005212 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005213
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215 Py_DECREF(unicode);
5216 Py_XDECREF(errorHandler);
5217 Py_XDECREF(exc);
5218 return NULL;
5219}
5220
5221PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005222_PyUnicode_EncodeUTF32(PyObject *str,
5223 const char *errors,
5224 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005226 int kind;
5227 void *data;
5228 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005231 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 /* Offsets from p for storing byte pairs in the right order. */
5233#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5234 int iorder[] = {0, 1, 2, 3};
5235#else
5236 int iorder[] = {3, 2, 1, 0};
5237#endif
5238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239#define STORECHAR(CH) \
5240 do { \
5241 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5242 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5243 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5244 p[iorder[0]] = (CH) & 0xff; \
5245 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 } while(0)
5247
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248 if (!PyUnicode_Check(str)) {
5249 PyErr_BadArgument();
5250 return NULL;
5251 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005252 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005253 return NULL;
5254 kind = PyUnicode_KIND(str);
5255 data = PyUnicode_DATA(str);
5256 len = PyUnicode_GET_LENGTH(str);
5257
5258 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005259 bytesize = nsize * 4;
5260 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005262 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005263 if (v == NULL)
5264 return NULL;
5265
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005266 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005270 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271
5272 if (byteorder == -1) {
5273 /* force LE */
5274 iorder[0] = 0;
5275 iorder[1] = 1;
5276 iorder[2] = 2;
5277 iorder[3] = 3;
5278 }
5279 else if (byteorder == 1) {
5280 /* force BE */
5281 iorder[0] = 3;
5282 iorder[1] = 2;
5283 iorder[2] = 1;
5284 iorder[3] = 0;
5285 }
5286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005287 for (i = 0; i < len; i++)
5288 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005289
5290 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005291 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292#undef STORECHAR
5293}
5294
Alexander Belopolsky40018472011-02-26 01:02:56 +00005295PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005296PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5297 Py_ssize_t size,
5298 const char *errors,
5299 int byteorder)
5300{
5301 PyObject *result;
5302 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5303 if (tmp == NULL)
5304 return NULL;
5305 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5306 Py_DECREF(tmp);
5307 return result;
5308}
5309
5310PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005311PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312{
Victor Stinnerb960b342011-11-20 19:12:52 +01005313 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314}
5315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316/* --- UTF-16 Codec ------------------------------------------------------- */
5317
Tim Peters772747b2001-08-09 22:21:55 +00005318PyObject *
5319PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 Py_ssize_t size,
5321 const char *errors,
5322 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
Walter Dörwald69652032004-09-07 20:24:22 +00005324 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5325}
5326
Antoine Pitrouab868312009-01-10 15:40:25 +00005327/* Two masks for fast checking of whether a C 'long' may contain
5328 UTF16-encoded surrogate characters. This is an efficient heuristic,
5329 assuming that non-surrogate characters with a code point >= 0x8000 are
5330 rare in most input.
5331 FAST_CHAR_MASK is used when the input is in native byte ordering,
5332 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005333*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005334#if (SIZEOF_LONG == 8)
5335# define FAST_CHAR_MASK 0x8000800080008000L
5336# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5337#elif (SIZEOF_LONG == 4)
5338# define FAST_CHAR_MASK 0x80008000L
5339# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5340#else
5341# error C 'long' size should be either 4 or 8!
5342#endif
5343
Walter Dörwald69652032004-09-07 20:24:22 +00005344PyObject *
5345PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 Py_ssize_t size,
5347 const char *errors,
5348 int *byteorder,
5349 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t startinpos;
5353 Py_ssize_t endinpos;
5354 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005355 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005356 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005357 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005359 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005360 /* Offsets from q for retrieving byte pairs in the right order. */
5361#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5362 int ihi = 1, ilo = 0;
5363#else
5364 int ihi = 0, ilo = 1;
5365#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 PyObject *errorHandler = NULL;
5367 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
5369 /* Note: size will always be longer than the resulting Unicode
5370 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005371 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 if (!unicode)
5373 return NULL;
5374 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005375 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005376 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Tim Peters772747b2001-08-09 22:21:55 +00005378 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005379 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380
5381 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005382 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005384 /* Check for BOM marks (U+FEFF) in the input and adjust current
5385 byte order setting accordingly. In native mode, the leading BOM
5386 mark is skipped, in all other modes, it is copied to the output
5387 stream as-is (giving a ZWNBSP character). */
5388 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005389 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005390 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005391#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 if (bom == 0xFEFF) {
5393 q += 2;
5394 bo = -1;
5395 }
5396 else if (bom == 0xFFFE) {
5397 q += 2;
5398 bo = 1;
5399 }
Tim Petersced69f82003-09-16 20:30:58 +00005400#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 if (bom == 0xFEFF) {
5402 q += 2;
5403 bo = 1;
5404 }
5405 else if (bom == 0xFFFE) {
5406 q += 2;
5407 bo = -1;
5408 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005409#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412
Tim Peters772747b2001-08-09 22:21:55 +00005413 if (bo == -1) {
5414 /* force LE */
5415 ihi = 1;
5416 ilo = 0;
5417 }
5418 else if (bo == 1) {
5419 /* force BE */
5420 ihi = 0;
5421 ilo = 1;
5422 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5424 native_ordering = ilo < ihi;
5425#else
5426 native_ordering = ilo > ihi;
5427#endif
Tim Peters772747b2001-08-09 22:21:55 +00005428
Antoine Pitrouab868312009-01-10 15:40:25 +00005429 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005430 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005431 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 /* First check for possible aligned read of a C 'long'. Unaligned
5433 reads are more expensive, better to defer to another iteration. */
5434 if (!((size_t) q & LONG_PTR_MASK)) {
5435 /* Fast path for runs of non-surrogate chars. */
5436 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 int kind = PyUnicode_KIND(unicode);
5438 void *data = PyUnicode_DATA(unicode);
5439 while (_q < aligned_end) {
5440 unsigned long block = * (unsigned long *) _q;
5441 unsigned short *pblock = (unsigned short*)&block;
5442 Py_UCS4 maxch;
5443 if (native_ordering) {
5444 /* Can use buffer directly */
5445 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005446 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005447 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005448 else {
5449 /* Need to byte-swap */
5450 unsigned char *_p = (unsigned char*)pblock;
5451 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005452 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453 _p[0] = _q[1];
5454 _p[1] = _q[0];
5455 _p[2] = _q[3];
5456 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005457#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005458 _p[4] = _q[5];
5459 _p[5] = _q[4];
5460 _p[6] = _q[7];
5461 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005462#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005463 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005464 maxch = Py_MAX(pblock[0], pblock[1]);
5465#if SIZEOF_LONG == 8
5466 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5467#endif
5468 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5469 if (unicode_widen(&unicode, maxch) < 0)
5470 goto onError;
5471 kind = PyUnicode_KIND(unicode);
5472 data = PyUnicode_DATA(unicode);
5473 }
5474 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5475 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5476#if SIZEOF_LONG == 8
5477 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5478 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5479#endif
5480 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005481 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005482 q = _q;
5483 if (q >= e)
5484 break;
5485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487
Benjamin Peterson14339b62009-01-31 16:36:08 +00005488 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005489
Victor Stinner551ac952011-11-29 22:58:13 +01005490 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005491 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5492 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 continue;
5494 }
5495
5496 /* UTF-16 code pair: */
5497 if (q > e) {
5498 errmsg = "unexpected end of data";
5499 startinpos = (((const char *)q) - 2) - starts;
5500 endinpos = ((const char *)e) + 1 - starts;
5501 goto utf16Error;
5502 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005503 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5504 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005506 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005507 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005508 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005509 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 continue;
5511 }
5512 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005513 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 startinpos = (((const char *)q)-4)-starts;
5515 endinpos = startinpos+2;
5516 goto utf16Error;
5517 }
5518
Benjamin Peterson14339b62009-01-31 16:36:08 +00005519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 errmsg = "illegal encoding";
5521 startinpos = (((const char *)q)-2)-starts;
5522 endinpos = startinpos+2;
5523 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005524
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005527 errors,
5528 &errorHandler,
5529 "utf16", errmsg,
5530 &starts,
5531 (const char **)&e,
5532 &startinpos,
5533 &endinpos,
5534 &exc,
5535 (const char **)&q,
5536 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005537 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005540 /* remaining byte at the end? (size should be even) */
5541 if (e == q) {
5542 if (!consumed) {
5543 errmsg = "truncated data";
5544 startinpos = ((const char *)q) - starts;
5545 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005546 if (unicode_decode_call_errorhandler(
5547 errors,
5548 &errorHandler,
5549 "utf16", errmsg,
5550 &starts,
5551 (const char **)&e,
5552 &startinpos,
5553 &endinpos,
5554 &exc,
5555 (const char **)&q,
5556 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005557 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005558 goto onError;
5559 /* The remaining input chars are ignored if the callback
5560 chooses to skip the input */
5561 }
5562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
5564 if (byteorder)
5565 *byteorder = bo;
5566
Walter Dörwald69652032004-09-07 20:24:22 +00005567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005571 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 goto onError;
5573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 Py_XDECREF(errorHandler);
5575 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005576 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 Py_XDECREF(errorHandler);
5581 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 return NULL;
5583}
5584
Antoine Pitrouab868312009-01-10 15:40:25 +00005585#undef FAST_CHAR_MASK
5586#undef SWAPPED_FAST_CHAR_MASK
5587
Tim Peters772747b2001-08-09 22:21:55 +00005588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005589_PyUnicode_EncodeUTF16(PyObject *str,
5590 const char *errors,
5591 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005593 int kind;
5594 void *data;
5595 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005596 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005597 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005598 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005599 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005600 /* Offsets from p for storing byte pairs in the right order. */
5601#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5602 int ihi = 1, ilo = 0;
5603#else
5604 int ihi = 0, ilo = 1;
5605#endif
5606
Benjamin Peterson29060642009-01-31 22:14:21 +00005607#define STORECHAR(CH) \
5608 do { \
5609 p[ihi] = ((CH) >> 8) & 0xff; \
5610 p[ilo] = (CH) & 0xff; \
5611 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005612 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005614 if (!PyUnicode_Check(str)) {
5615 PyErr_BadArgument();
5616 return NULL;
5617 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005618 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return NULL;
5620 kind = PyUnicode_KIND(str);
5621 data = PyUnicode_DATA(str);
5622 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005623
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005624 pairs = 0;
5625 if (kind == PyUnicode_4BYTE_KIND)
5626 for (i = 0; i < len; i++)
5627 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5628 pairs++;
5629 /* 2 * (len + pairs + (byteorder == 0)) */
5630 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005633 bytesize = nsize * 2;
5634 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005636 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 if (v == NULL)
5638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005640 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005644 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005645
5646 if (byteorder == -1) {
5647 /* force LE */
5648 ihi = 1;
5649 ilo = 0;
5650 }
5651 else if (byteorder == 1) {
5652 /* force BE */
5653 ihi = 0;
5654 ilo = 1;
5655 }
5656
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 for (i = 0; i < len; i++) {
5658 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5659 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005661 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5662 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 }
Tim Peters772747b2001-08-09 22:21:55 +00005664 STORECHAR(ch);
5665 if (ch2)
5666 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005667 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005668
5669 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005670 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005671#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672}
5673
Alexander Belopolsky40018472011-02-26 01:02:56 +00005674PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5676 Py_ssize_t size,
5677 const char *errors,
5678 int byteorder)
5679{
5680 PyObject *result;
5681 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5682 if (tmp == NULL)
5683 return NULL;
5684 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5685 Py_DECREF(tmp);
5686 return result;
5687}
5688
5689PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005690PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005692 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
5695/* --- Unicode Escape Codec ----------------------------------------------- */
5696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5698 if all the escapes in the string make it still a valid ASCII string.
5699 Returns -1 if any escapes were found which cause the string to
5700 pop out of ASCII range. Otherwise returns the length of the
5701 required buffer to hold the string.
5702 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005703static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5705{
5706 const unsigned char *p = (const unsigned char *)s;
5707 const unsigned char *end = p + size;
5708 Py_ssize_t length = 0;
5709
5710 if (size < 0)
5711 return -1;
5712
5713 for (; p < end; ++p) {
5714 if (*p > 127) {
5715 /* Non-ASCII */
5716 return -1;
5717 }
5718 else if (*p != '\\') {
5719 /* Normal character */
5720 ++length;
5721 }
5722 else {
5723 /* Backslash-escape, check next char */
5724 ++p;
5725 /* Escape sequence reaches till end of string or
5726 non-ASCII follow-up. */
5727 if (p >= end || *p > 127)
5728 return -1;
5729 switch (*p) {
5730 case '\n':
5731 /* backslash + \n result in zero characters */
5732 break;
5733 case '\\': case '\'': case '\"':
5734 case 'b': case 'f': case 't':
5735 case 'n': case 'r': case 'v': case 'a':
5736 ++length;
5737 break;
5738 case '0': case '1': case '2': case '3':
5739 case '4': case '5': case '6': case '7':
5740 case 'x': case 'u': case 'U': case 'N':
5741 /* these do not guarantee ASCII characters */
5742 return -1;
5743 default:
5744 /* count the backslash + the other character */
5745 length += 2;
5746 }
5747 }
5748 }
5749 return length;
5750}
5751
/* File-scope pointer to a _PyUnicode_Name_CAPI, initialised to NULL.
   NOTE(review): presumably filled in on demand by the \N{...} handling of
   the unicode-escape decoder; its users are not visible here -- confirm. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005752static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
5755PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005756 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 Py_ssize_t startinpos;
5761 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005762 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005763 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005765 char* message;
5766 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 PyObject *errorHandler = NULL;
5768 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005769 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005772 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773
5774 /* After length_of_escaped_ascii_string() there are two alternatives,
5775 either the string is pure ASCII with named escapes like \n, etc.
5776 and we determined it's exact size (common case)
5777 or it contains \x, \u, ... escape sequences. then we create a
5778 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005779 if (len >= 0) {
5780 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 if (!v)
5782 goto onError;
5783 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 }
5785 else {
5786 /* Escaped strings will always be longer than the resulting
5787 Unicode string, so we start with size here and then reduce the
5788 length after conversion to the true value.
5789 (but if the error callback returns a long replacement string
5790 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005791 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005792 if (!v)
5793 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005794 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 }
5796
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005798 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005799 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 while (s < end) {
5803 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005804 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005807 /* The only case in which i == ascii_length is a backslash
5808 followed by a newline. */
5809 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 /* Non-escape characters are interpreted as Unicode ordinals */
5812 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005813 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 continue;
5816 }
5817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 /* \ - Escapes */
5820 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005821 c = *s++;
5822 if (s > end)
5823 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825 /* The only case in which i == ascii_length is a backslash
5826 followed by a newline. */
5827 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005829 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832#define WRITECHAR(ch) \
5833 do { \
5834 if (unicode_putchar(&v, &i, ch) < 0) \
5835 goto onError; \
5836 }while(0)
5837
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 case '\\': WRITECHAR('\\'); break;
5840 case '\'': WRITECHAR('\''); break;
5841 case '\"': WRITECHAR('\"'); break;
5842 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005843 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 case 'f': WRITECHAR('\014'); break;
5845 case 't': WRITECHAR('\t'); break;
5846 case 'n': WRITECHAR('\n'); break;
5847 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005848 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005849 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 case '0': case '1': case '2': case '3':
5855 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005856 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005857 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005858 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005859 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005860 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 break;
5864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* hex escapes */
5866 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005868 digits = 2;
5869 message = "truncated \\xXX escape";
5870 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005874 digits = 4;
5875 message = "truncated \\uXXXX escape";
5876 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005879 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005880 digits = 8;
5881 message = "truncated \\UXXXXXXXX escape";
5882 hexescape:
5883 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 if (s+digits>end) {
5885 endinpos = size;
5886 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 errors, &errorHandler,
5888 "unicodeescape", "end of string in escape sequence",
5889 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005890 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 goto onError;
5892 goto nextByte;
5893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005894 for (j = 0; j < digits; ++j) {
5895 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005896 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005897 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 errors, &errorHandler,
5900 "unicodeescape", message,
5901 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005902 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005903 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005906 }
5907 chr = (chr<<4) & ~0xF;
5908 if (c >= '0' && c <= '9')
5909 chr += c - '0';
5910 else if (c >= 'a' && c <= 'f')
5911 chr += 10 + c - 'a';
5912 else
5913 chr += 10 + c - 'A';
5914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005916 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 /* _decoding_error will have already written into the
5918 target buffer. */
5919 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005921 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005922 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005924 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 errors, &errorHandler,
5928 "unicodeescape", "illegal Unicode character",
5929 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005931 goto onError;
5932 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933 break;
5934
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005936 case 'N':
5937 message = "malformed \\N character escape";
5938 if (ucnhash_CAPI == NULL) {
5939 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005940 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5941 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005942 if (ucnhash_CAPI == NULL)
5943 goto ucnhashError;
5944 }
5945 if (*s == '{') {
5946 const char *start = s+1;
5947 /* look for the closing brace */
5948 while (*s != '}' && s < end)
5949 s++;
5950 if (s > start && s < end && *s == '}') {
5951 /* found a name. look it up in the unicode database */
5952 message = "unknown Unicode character name";
5953 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005955 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005956 goto store;
5957 }
5958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 errors, &errorHandler,
5962 "unicodeescape", message,
5963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005965 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005966 break;
5967
5968 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005969 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 message = "\\ at end of string";
5971 s--;
5972 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 errors, &errorHandler,
5975 "unicodeescape", message,
5976 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005977 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005978 goto onError;
5979 }
5980 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005981 WRITECHAR('\\');
5982 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005983 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005984 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005989#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990
Victor Stinner16e6a802011-12-12 13:24:15 +01005991 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005992 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005993 Py_XDECREF(errorHandler);
5994 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005995 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005996
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005998 PyErr_SetString(
5999 PyExc_UnicodeError,
6000 "\\N escapes not supported (can't load unicodedata module)"
6001 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006002 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 Py_XDECREF(errorHandler);
6004 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006005 return NULL;
6006
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 Py_XDECREF(errorHandler);
6010 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 return NULL;
6012}
6013
6014/* Return a Unicode-Escape string version of the Unicode object.
6015
6016 If quotes is true, the string is enclosed in u"" or u'' quotes as
6017 appropriate.
6018
6019*/
6020
Alexander Belopolsky40018472011-02-26 01:02:56 +00006021PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006022PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006025 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 int kind;
6028 void *data;
6029 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
Thomas Wouters89f507f2006-12-13 04:49:30 +00006031 /* Initial allocation is based on the longest-possible unichr
6032 escape.
6033
6034 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6035 unichr, so in this case it's the longest unichr escape. In
6036 narrow (UTF-16) builds this is five chars per source unichr
6037 since there are two unichrs in the surrogate pair, so in narrow
6038 (UTF-16) builds it's not the longest unichr escape.
6039
6040 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6041 so in the narrow (UTF-16) build case it's the longest unichr
6042 escape.
6043 */
6044
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 if (!PyUnicode_Check(unicode)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006049 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 return NULL;
6051 len = PyUnicode_GET_LENGTH(unicode);
6052 kind = PyUnicode_KIND(unicode);
6053 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006054 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006055 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6056 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6057 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6058 }
6059
6060 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006061 return PyBytes_FromStringAndSize(NULL, 0);
6062
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006065
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006066 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006068 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 if (repr == NULL)
6071 return NULL;
6072
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006073 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006076 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006077
Walter Dörwald79e913e2007-05-12 11:08:06 +00006078 /* Escape backslashes */
6079 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 *p++ = '\\';
6081 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006082 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006084
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006085 /* Map 21-bit characters to '\U00xxxxxx' */
6086 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006087 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006088 *p++ = '\\';
6089 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006090 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6091 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6092 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6093 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6094 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6095 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6096 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6097 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006099 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006100
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006102 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 *p++ = '\\';
6104 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006105 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6106 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6107 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6108 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006110
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006111 /* Map special whitespace to '\t', \n', '\r' */
6112 else if (ch == '\t') {
6113 *p++ = '\\';
6114 *p++ = 't';
6115 }
6116 else if (ch == '\n') {
6117 *p++ = '\\';
6118 *p++ = 'n';
6119 }
6120 else if (ch == '\r') {
6121 *p++ = '\\';
6122 *p++ = 'r';
6123 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006124
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006125 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006126 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006128 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006129 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6130 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006131 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Copy everything else as-is */
6134 else
6135 *p++ = (char) ch;
6136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 assert(p - PyBytes_AS_STRING(repr) > 0);
6139 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6140 return NULL;
6141 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142}
6143
Alexander Belopolsky40018472011-02-26 01:02:56 +00006144PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6146 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 PyObject *result;
6149 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6150 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 result = PyUnicode_AsUnicodeEscapeString(tmp);
6153 Py_DECREF(tmp);
6154 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
6157/* --- Raw Unicode Escape Codec ------------------------------------------- */
6158
Alexander Belopolsky40018472011-02-26 01:02:56 +00006159PyObject *
6160PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006161 Py_ssize_t size,
6162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006165 Py_ssize_t startinpos;
6166 Py_ssize_t endinpos;
6167 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006168 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 const char *end;
6170 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 PyObject *errorHandler = NULL;
6172 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 /* Escaped strings will always be longer than the resulting
6175 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 length after conversion to the true value. (But decoding error
6177 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006178 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006182 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006183 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 end = s + size;
6185 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 unsigned char c;
6187 Py_UCS4 x;
6188 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Non-escape characters are interpreted as Unicode ordinals */
6192 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006193 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6194 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 startinpos = s-starts;
6198
6199 /* \u-escapes are only interpreted iff the number of leading
6200 backslashes if odd */
6201 bs = s;
6202 for (;s < end;) {
6203 if (*s != '\\')
6204 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006205 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6206 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 }
6208 if (((s - bs) & 1) == 0 ||
6209 s >= end ||
6210 (*s != 'u' && *s != 'U')) {
6211 continue;
6212 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006213 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 count = *s=='u' ? 4 : 8;
6215 s++;
6216
6217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 for (x = 0, i = 0; i < count; ++i, ++s) {
6219 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006220 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 endinpos = s-starts;
6222 if (unicode_decode_call_errorhandler(
6223 errors, &errorHandler,
6224 "rawunicodeescape", "truncated \\uXXXX",
6225 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 goto onError;
6228 goto nextByte;
6229 }
6230 x = (x<<4) & ~0xF;
6231 if (c >= '0' && c <= '9')
6232 x += c - '0';
6233 else if (c >= 'a' && c <= 'f')
6234 x += 10 + c - 'a';
6235 else
6236 x += 10 + c - 'A';
6237 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006238 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006239 if (unicode_putchar(&v, &outpos, x) < 0)
6240 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006241 } else {
6242 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006243 if (unicode_decode_call_errorhandler(
6244 errors, &errorHandler,
6245 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006247 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 nextByte:
6251 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006253 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006255 Py_XDECREF(errorHandler);
6256 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006257 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 Py_XDECREF(errorHandler);
6262 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 return NULL;
6264}
6265
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006270 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 char *p;
6272 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006273 Py_ssize_t expandsize, pos;
6274 int kind;
6275 void *data;
6276 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 if (!PyUnicode_Check(unicode)) {
6279 PyErr_BadArgument();
6280 return NULL;
6281 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006282 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283 return NULL;
6284 kind = PyUnicode_KIND(unicode);
6285 data = PyUnicode_DATA(unicode);
6286 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006287 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6288 bytes, and 1 byte characters 4. */
6289 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006290
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006291 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006293
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006294 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 if (repr == NULL)
6296 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006297 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006298 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006300 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006301 for (pos = 0; pos < len; pos++) {
6302 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* Map 32-bit characters to '\Uxxxxxxxx' */
6304 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006305 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006306 *p++ = '\\';
6307 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006308 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6309 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6310 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6311 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6312 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6313 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6314 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6315 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 *p++ = '\\';
6320 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006321 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6322 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6323 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6324 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* Copy everything else as-is */
6327 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 *p++ = (char) ch;
6329 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006330
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006331 assert(p > q);
6332 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006333 return NULL;
6334 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006338PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6339 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006341 PyObject *result;
6342 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6343 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006344 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006345 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6346 Py_DECREF(tmp);
6347 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348}
6349
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006350/* --- Unicode Internal Codec ------------------------------------------- */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t size,
6355 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006356{
6357 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 Py_ssize_t startinpos;
6359 Py_ssize_t endinpos;
6360 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006361 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006362 const char *end;
6363 const char *reason;
6364 PyObject *errorHandler = NULL;
6365 PyObject *exc = NULL;
6366
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006367 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006368 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006369 1))
6370 return NULL;
6371
Thomas Wouters89f507f2006-12-13 04:49:30 +00006372 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006373 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006376 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006377 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006378 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006379 end = s + size;
6380
6381 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006382 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006383 Py_UCS4 ch;
6384 /* We copy the raw representation one byte at a time because the
6385 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006386 ((char *) &uch)[0] = s[0];
6387 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006388#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006389 ((char *) &uch)[2] = s[2];
6390 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006391#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006392 ch = uch;
6393
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006394 /* We have to sanity check the raw data, otherwise doom looms for
6395 some malformed UCS-4 data. */
6396 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006397#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006398 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006399#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006400 end-s < Py_UNICODE_SIZE
6401 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006403 startinpos = s - starts;
6404 if (end-s < Py_UNICODE_SIZE) {
6405 endinpos = end-starts;
6406 reason = "truncated input";
6407 }
6408 else {
6409 endinpos = s - starts + Py_UNICODE_SIZE;
6410 reason = "illegal code point (> 0x10FFFF)";
6411 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 if (unicode_decode_call_errorhandler(
6413 errors, &errorHandler,
6414 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006415 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006416 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006417 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 continue;
6419 }
6420
6421 s += Py_UNICODE_SIZE;
6422#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006423 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006424 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006425 Py_UNICODE uch2;
6426 ((char *) &uch2)[0] = s[0];
6427 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006428 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006429 {
Victor Stinner551ac952011-11-29 22:58:13 +01006430 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006431 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006432 }
6433 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006434#endif
6435
6436 if (unicode_putchar(&v, &outpos, ch) < 0)
6437 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006438 }
6439
Victor Stinner16e6a802011-12-12 13:24:15 +01006440 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006441 goto onError;
6442 Py_XDECREF(errorHandler);
6443 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006444 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006445
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006447 Py_XDECREF(v);
6448 Py_XDECREF(errorHandler);
6449 Py_XDECREF(exc);
6450 return NULL;
6451}
6452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453/* --- Latin-1 Codec ------------------------------------------------------ */
6454
Alexander Belopolsky40018472011-02-26 01:02:56 +00006455PyObject *
6456PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006457 Py_ssize_t size,
6458 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006461 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462}
6463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006465static void
6466make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006467 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006468 PyObject *unicode,
6469 Py_ssize_t startpos, Py_ssize_t endpos,
6470 const char *reason)
6471{
6472 if (*exceptionObject == NULL) {
6473 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006475 encoding, unicode, startpos, endpos, reason);
6476 }
6477 else {
6478 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6479 goto onError;
6480 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6481 goto onError;
6482 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6483 goto onError;
6484 return;
6485 onError:
6486 Py_DECREF(*exceptionObject);
6487 *exceptionObject = NULL;
6488 }
6489}
6490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492static void
6493raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006494 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006495 PyObject *unicode,
6496 Py_ssize_t startpos, Py_ssize_t endpos,
6497 const char *reason)
6498{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006499 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006500 encoding, unicode, startpos, endpos, reason);
6501 if (*exceptionObject != NULL)
6502 PyCodec_StrictErrors(*exceptionObject);
6503}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504
6505/* error handling callback helper:
6506 build arguments, call the callback and check the arguments,
6507 put the result into newpos and return the replacement string, which
6508 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006509static PyObject *
6510unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006511 PyObject **errorHandler,
6512 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006514 Py_ssize_t startpos, Py_ssize_t endpos,
6515 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006517 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 PyObject *restuple;
6520 PyObject *resunicode;
6521
6522 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 }
6527
Benjamin Petersonbac79492012-01-14 13:34:47 -05006528 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006529 return NULL;
6530 len = PyUnicode_GET_LENGTH(unicode);
6531
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006532 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536
6537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006542 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 Py_DECREF(restuple);
6544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006546 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 &resunicode, newpos)) {
6548 Py_DECREF(restuple);
6549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006551 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6552 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6553 Py_DECREF(restuple);
6554 return NULL;
6555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 *newpos = len + *newpos;
6558 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6560 Py_DECREF(restuple);
6561 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 Py_INCREF(resunicode);
6564 Py_DECREF(restuple);
6565 return resunicode;
6566}
6567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006570 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006571 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 /* input state */
6574 Py_ssize_t pos=0, size;
6575 int kind;
6576 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 /* output object */
6578 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 /* pointer into the output */
6580 char *str;
6581 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006583 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6584 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585 PyObject *errorHandler = NULL;
6586 PyObject *exc = NULL;
6587 /* the following variable is used for caching string comparisons
6588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6589 int known_errorHandler = -1;
6590
Benjamin Petersonbac79492012-01-14 13:34:47 -05006591 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006592 return NULL;
6593 size = PyUnicode_GET_LENGTH(unicode);
6594 kind = PyUnicode_KIND(unicode);
6595 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596 /* allocate enough for a simple encoding without
6597 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006598 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006599 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006600 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006601 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006602 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006603 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604 ressize = size;
6605
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 while (pos < size) {
6607 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 /* can we encode this? */
6610 if (c<limit) {
6611 /* no overflow check, because we know that the space is enough */
6612 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 Py_ssize_t requiredsize;
6617 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 Py_ssize_t collstart = pos;
6621 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 ++collend;
6625 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6626 if (known_errorHandler==-1) {
6627 if ((errors==NULL) || (!strcmp(errors, "strict")))
6628 known_errorHandler = 1;
6629 else if (!strcmp(errors, "replace"))
6630 known_errorHandler = 2;
6631 else if (!strcmp(errors, "ignore"))
6632 known_errorHandler = 3;
6633 else if (!strcmp(errors, "xmlcharrefreplace"))
6634 known_errorHandler = 4;
6635 else
6636 known_errorHandler = 0;
6637 }
6638 switch (known_errorHandler) {
6639 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006640 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 goto onError;
6642 case 2: /* replace */
6643 while (collstart++<collend)
6644 *str++ = '?'; /* fall through */
6645 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 break;
6648 case 4: /* xmlcharrefreplace */
6649 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 /* determine replacement size */
6651 for (i = collstart, repsize = 0; i < collend; ++i) {
6652 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6653 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006665 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006666 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 if (requiredsize > ressize) {
6672 if (requiredsize<2*ressize)
6673 requiredsize = 2*ressize;
6674 if (_PyBytes_Resize(&res, requiredsize))
6675 goto onError;
6676 str = PyBytes_AS_STRING(res) + respos;
6677 ressize = requiredsize;
6678 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 /* generate replacement */
6680 for (i = collstart; i < collend; ++i) {
6681 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006683 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 break;
6685 default:
6686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006687 encoding, reason, unicode, &exc,
6688 collstart, collend, &newpos);
6689 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006690 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006692 if (PyBytes_Check(repunicode)) {
6693 /* Directly copy bytes result to output. */
6694 repsize = PyBytes_Size(repunicode);
6695 if (repsize > 1) {
6696 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006697 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006698 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6699 Py_DECREF(repunicode);
6700 goto onError;
6701 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006702 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006703 ressize += repsize-1;
6704 }
6705 memcpy(str, PyBytes_AsString(repunicode), repsize);
6706 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006707 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006708 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006709 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 /* need more space? (at least enough for what we
6712 have+the replacement+the rest of the string, so
6713 we won't have to check space for encodable characters) */
6714 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 repsize = PyUnicode_GET_LENGTH(repunicode);
6716 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 if (requiredsize > ressize) {
6718 if (requiredsize<2*ressize)
6719 requiredsize = 2*ressize;
6720 if (_PyBytes_Resize(&res, requiredsize)) {
6721 Py_DECREF(repunicode);
6722 goto onError;
6723 }
6724 str = PyBytes_AS_STRING(res) + respos;
6725 ressize = requiredsize;
6726 }
6727 /* check if there is anything unencodable in the replacement
6728 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 for (i = 0; repsize-->0; ++i, ++str) {
6730 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006732 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 Py_DECREF(repunicode);
6735 goto onError;
6736 }
6737 *str = (char)c;
6738 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 }
6743 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006744 /* Resize if we allocated to much */
6745 size = str - PyBytes_AS_STRING(res);
6746 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006747 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006748 if (_PyBytes_Resize(&res, size) < 0)
6749 goto onError;
6750 }
6751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 Py_XDECREF(errorHandler);
6753 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006754 return res;
6755
6756 onError:
6757 Py_XDECREF(res);
6758 Py_XDECREF(errorHandler);
6759 Py_XDECREF(exc);
6760 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761}
6762
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764PyObject *
6765PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006766 Py_ssize_t size,
6767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 PyObject *result;
6770 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6771 if (unicode == NULL)
6772 return NULL;
6773 result = unicode_encode_ucs1(unicode, errors, 256);
6774 Py_DECREF(unicode);
6775 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
Alexander Belopolsky40018472011-02-26 01:02:56 +00006778PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780{
6781 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 PyErr_BadArgument();
6783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785 if (PyUnicode_READY(unicode) == -1)
6786 return NULL;
6787 /* Fast path: if it is a one-byte string, construct
6788 bytes object directly. */
6789 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6790 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6791 PyUnicode_GET_LENGTH(unicode));
6792 /* Non-Latin-1 characters present. Defer to above function to
6793 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006795}
6796
6797PyObject*
6798PyUnicode_AsLatin1String(PyObject *unicode)
6799{
6800 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
6803/* --- 7-bit ASCII Codec -------------------------------------------------- */
6804
Alexander Belopolsky40018472011-02-26 01:02:56 +00006805PyObject *
6806PyUnicode_DecodeASCII(const char *s,
6807 Py_ssize_t size,
6808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006811 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006812 int kind;
6813 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006814 Py_ssize_t startinpos;
6815 Py_ssize_t endinpos;
6816 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006817 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006818 int has_error;
6819 const unsigned char *p = (const unsigned char *)s;
6820 const unsigned char *end = p + size;
6821 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 PyObject *errorHandler = NULL;
6823 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006824
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006825 if (size == 0) {
6826 Py_INCREF(unicode_empty);
6827 return unicode_empty;
6828 }
6829
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006831 if (size == 1 && (unsigned char)s[0] < 128)
6832 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006833
Victor Stinner702c7342011-10-05 13:50:52 +02006834 has_error = 0;
6835 while (p < end && !has_error) {
6836 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6837 an explanation. */
6838 if (!((size_t) p & LONG_PTR_MASK)) {
6839 /* Help register allocation */
6840 register const unsigned char *_p = p;
6841 while (_p < aligned_end) {
6842 unsigned long value = *(unsigned long *) _p;
6843 if (value & ASCII_CHAR_MASK) {
6844 has_error = 1;
6845 break;
6846 }
6847 _p += SIZEOF_LONG;
6848 }
6849 if (_p == end)
6850 break;
6851 if (has_error)
6852 break;
6853 p = _p;
6854 }
6855 if (*p & 0x80) {
6856 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006857 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006858 }
6859 else {
6860 ++p;
6861 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006862 }
Victor Stinner702c7342011-10-05 13:50:52 +02006863 if (!has_error)
6864 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006865
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006866 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006870 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006871 kind = PyUnicode_KIND(v);
6872 data = PyUnicode_DATA(v);
6873 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 e = s + size;
6875 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 register unsigned char c = (unsigned char)*s;
6877 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006878 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 ++s;
6880 }
6881 else {
6882 startinpos = s-starts;
6883 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 if (unicode_decode_call_errorhandler(
6885 errors, &errorHandler,
6886 "ascii", "ordinal not in range(128)",
6887 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006888 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006890 kind = PyUnicode_KIND(v);
6891 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006894 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 Py_XDECREF(errorHandler);
6897 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006898 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006899 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006900
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 Py_XDECREF(errorHandler);
6904 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 return NULL;
6906}
6907
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006909PyObject *
6910PyUnicode_EncodeASCII(const Py_UNICODE *p,
6911 Py_ssize_t size,
6912 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 PyObject *result;
6915 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6916 if (unicode == NULL)
6917 return NULL;
6918 result = unicode_encode_ucs1(unicode, errors, 128);
6919 Py_DECREF(unicode);
6920 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
Alexander Belopolsky40018472011-02-26 01:02:56 +00006923PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006924_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
6926 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 PyErr_BadArgument();
6928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006930 if (PyUnicode_READY(unicode) == -1)
6931 return NULL;
6932 /* Fast path: if it is an ASCII-only string, construct bytes object
6933 directly. Else defer to above function to raise the exception. */
6934 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6935 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6936 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006937 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006938}
6939
6940PyObject *
6941PyUnicode_AsASCIIString(PyObject *unicode)
6942{
6943 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944}
6945
Victor Stinner99b95382011-07-04 14:23:54 +02006946#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006947
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006948/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006949
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006950#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951#define NEED_RETRY
6952#endif
6953
Victor Stinner3a50e702011-10-18 21:21:00 +02006954#ifndef WC_ERR_INVALID_CHARS
6955# define WC_ERR_INVALID_CHARS 0x0080
6956#endif
6957
6958static char*
6959code_page_name(UINT code_page, PyObject **obj)
6960{
6961 *obj = NULL;
6962 if (code_page == CP_ACP)
6963 return "mbcs";
6964 if (code_page == CP_UTF7)
6965 return "CP_UTF7";
6966 if (code_page == CP_UTF8)
6967 return "CP_UTF8";
6968
6969 *obj = PyBytes_FromFormat("cp%u", code_page);
6970 if (*obj == NULL)
6971 return NULL;
6972 return PyBytes_AS_STRING(*obj);
6973}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
Alexander Belopolsky40018472011-02-26 01:02:56 +00006975static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006976is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977{
6978 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 if (!IsDBCSLeadByteEx(code_page, *curr))
6982 return 0;
6983
6984 prev = CharPrevExA(code_page, s, curr, 0);
6985 if (prev == curr)
6986 return 1;
6987 /* FIXME: This code is limited to "true" double-byte encodings,
6988 as it assumes an incomplete character consists of a single
6989 byte. */
6990 if (curr - prev == 2)
6991 return 1;
6992 if (!IsDBCSLeadByteEx(code_page, *prev))
6993 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 return 0;
6995}
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997static DWORD
6998decode_code_page_flags(UINT code_page)
6999{
7000 if (code_page == CP_UTF7) {
7001 /* The CP_UTF7 decoder only supports flags=0 */
7002 return 0;
7003 }
7004 else
7005 return MB_ERR_INVALID_CHARS;
7006}
7007
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 * Decode a byte string from a Windows code page into unicode object in strict
7010 * mode.
7011 *
7012 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7013 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007015static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007016decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007017 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 const char *in,
7019 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020{
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007022 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
7025 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 assert(insize > 0);
7027 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7028 if (outsize <= 0)
7029 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
7031 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007033 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007034 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 if (*v == NULL)
7036 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007037 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038 }
7039 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007041 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007042 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045 }
7046
7047 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7049 if (outsize <= 0)
7050 goto error;
7051 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053error:
7054 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7055 return -2;
7056 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007057 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058}
7059
Victor Stinner3a50e702011-10-18 21:21:00 +02007060/*
7061 * Decode a byte string from a code page into unicode object with an error
7062 * handler.
7063 *
7064 * Returns consumed size if succeed, or raise a WindowsError or
7065 * UnicodeDecodeError exception and returns -1 on error.
7066 */
7067static int
7068decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 PyObject **v,
7070 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 const char *errors)
7072{
7073 const char *startin = in;
7074 const char *endin = in + size;
7075 const DWORD flags = decode_code_page_flags(code_page);
7076 /* Ideally, we should get reason from FormatMessage. This is the Windows
7077 2000 English version of the message. */
7078 const char *reason = "No mapping for the Unicode character exists "
7079 "in the target code page.";
7080 /* each step cannot decode more than 1 character, but a character can be
7081 represented as a surrogate pair */
7082 wchar_t buffer[2], *startout, *out;
7083 int insize, outsize;
7084 PyObject *errorHandler = NULL;
7085 PyObject *exc = NULL;
7086 PyObject *encoding_obj = NULL;
7087 char *encoding;
7088 DWORD err;
7089 int ret = -1;
7090
7091 assert(size > 0);
7092
7093 encoding = code_page_name(code_page, &encoding_obj);
7094 if (encoding == NULL)
7095 return -1;
7096
7097 if (errors == NULL || strcmp(errors, "strict") == 0) {
7098 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7099 UnicodeDecodeError. */
7100 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7101 if (exc != NULL) {
7102 PyCodec_StrictErrors(exc);
7103 Py_CLEAR(exc);
7104 }
7105 goto error;
7106 }
7107
7108 if (*v == NULL) {
7109 /* Create unicode object */
7110 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7111 PyErr_NoMemory();
7112 goto error;
7113 }
Victor Stinnerab595942011-12-17 04:59:06 +01007114 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 if (*v == NULL)
7117 goto error;
7118 startout = PyUnicode_AS_UNICODE(*v);
7119 }
7120 else {
7121 /* Extend unicode object */
7122 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7123 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7124 PyErr_NoMemory();
7125 goto error;
7126 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007127 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 goto error;
7129 startout = PyUnicode_AS_UNICODE(*v) + n;
7130 }
7131
7132 /* Decode the byte string character per character */
7133 out = startout;
7134 while (in < endin)
7135 {
7136 /* Decode a character */
7137 insize = 1;
7138 do
7139 {
7140 outsize = MultiByteToWideChar(code_page, flags,
7141 in, insize,
7142 buffer, Py_ARRAY_LENGTH(buffer));
7143 if (outsize > 0)
7144 break;
7145 err = GetLastError();
7146 if (err != ERROR_NO_UNICODE_TRANSLATION
7147 && err != ERROR_INSUFFICIENT_BUFFER)
7148 {
7149 PyErr_SetFromWindowsErr(0);
7150 goto error;
7151 }
7152 insize++;
7153 }
7154 /* 4=maximum length of a UTF-8 sequence */
7155 while (insize <= 4 && (in + insize) <= endin);
7156
7157 if (outsize <= 0) {
7158 Py_ssize_t startinpos, endinpos, outpos;
7159
7160 startinpos = in - startin;
7161 endinpos = startinpos + 1;
7162 outpos = out - PyUnicode_AS_UNICODE(*v);
7163 if (unicode_decode_call_errorhandler(
7164 errors, &errorHandler,
7165 encoding, reason,
7166 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007167 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 {
7169 goto error;
7170 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007171 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 }
7173 else {
7174 in += insize;
7175 memcpy(out, buffer, outsize * sizeof(wchar_t));
7176 out += outsize;
7177 }
7178 }
7179
7180 /* write a NUL character at the end */
7181 *out = 0;
7182
7183 /* Extend unicode object */
7184 outsize = out - startout;
7185 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007186 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007189
7190error:
7191 Py_XDECREF(encoding_obj);
7192 Py_XDECREF(errorHandler);
7193 Py_XDECREF(exc);
7194 return ret;
7195}
7196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197static PyObject *
7198decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 const char *s, Py_ssize_t size,
7200 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201{
Victor Stinner76a31a62011-11-04 00:05:13 +01007202 PyObject *v = NULL;
7203 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 if (code_page < 0) {
7206 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7207 return NULL;
7208 }
7209
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212
Victor Stinner76a31a62011-11-04 00:05:13 +01007213 do
7214 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 if (size > INT_MAX) {
7217 chunk_size = INT_MAX;
7218 final = 0;
7219 done = 0;
7220 }
7221 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007223 {
7224 chunk_size = (int)size;
7225 final = (consumed == NULL);
7226 done = 1;
7227 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007228
Victor Stinner76a31a62011-11-04 00:05:13 +01007229 /* Skip trailing lead-byte unless 'final' is set */
7230 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7231 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007232
Victor Stinner76a31a62011-11-04 00:05:13 +01007233 if (chunk_size == 0 && done) {
7234 if (v != NULL)
7235 break;
7236 Py_INCREF(unicode_empty);
7237 return unicode_empty;
7238 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239
Victor Stinner76a31a62011-11-04 00:05:13 +01007240
7241 converted = decode_code_page_strict(code_page, &v,
7242 s, chunk_size);
7243 if (converted == -2)
7244 converted = decode_code_page_errors(code_page, &v,
7245 s, chunk_size,
7246 errors);
7247 assert(converted != 0);
7248
7249 if (converted < 0) {
7250 Py_XDECREF(v);
7251 return NULL;
7252 }
7253
7254 if (consumed)
7255 *consumed += converted;
7256
7257 s += converted;
7258 size -= converted;
7259 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007260
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007261 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262}
7263
Alexander Belopolsky40018472011-02-26 01:02:56 +00007264PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007265PyUnicode_DecodeCodePageStateful(int code_page,
7266 const char *s,
7267 Py_ssize_t size,
7268 const char *errors,
7269 Py_ssize_t *consumed)
7270{
7271 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7272}
7273
7274PyObject *
7275PyUnicode_DecodeMBCSStateful(const char *s,
7276 Py_ssize_t size,
7277 const char *errors,
7278 Py_ssize_t *consumed)
7279{
7280 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7281}
7282
7283PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007284PyUnicode_DecodeMBCS(const char *s,
7285 Py_ssize_t size,
7286 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7289}
7290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291static DWORD
7292encode_code_page_flags(UINT code_page, const char *errors)
7293{
7294 if (code_page == CP_UTF8) {
7295 if (winver.dwMajorVersion >= 6)
7296 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7297 and later */
7298 return WC_ERR_INVALID_CHARS;
7299 else
7300 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7301 return 0;
7302 }
7303 else if (code_page == CP_UTF7) {
7304 /* CP_UTF7 only supports flags=0 */
7305 return 0;
7306 }
7307 else {
7308 if (errors != NULL && strcmp(errors, "replace") == 0)
7309 return 0;
7310 else
7311 return WC_NO_BEST_FIT_CHARS;
7312 }
7313}
7314
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 * Encode a Unicode string to a Windows code page into a byte string in strict
7317 * mode.
7318 *
7319 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7320 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007322static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007323encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326{
Victor Stinner554f3f02010-06-16 23:33:54 +00007327 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 BOOL *pusedDefaultChar = &usedDefaultChar;
7329 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007330 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007331 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007332 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 const DWORD flags = encode_code_page_flags(code_page, NULL);
7334 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007335 /* Create a substring so that we can get the UTF-16 representation
7336 of just the slice under consideration. */
7337 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007342 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007344 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007345
Victor Stinner2fc507f2011-11-04 20:06:39 +01007346 substring = PyUnicode_Substring(unicode, offset, offset+len);
7347 if (substring == NULL)
7348 return -1;
7349 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7350 if (p == NULL) {
7351 Py_DECREF(substring);
7352 return -1;
7353 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007355 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 outsize = WideCharToMultiByte(code_page, flags,
7357 p, size,
7358 NULL, 0,
7359 NULL, pusedDefaultChar);
7360 if (outsize <= 0)
7361 goto error;
7362 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007363 if (pusedDefaultChar && *pusedDefaultChar) {
7364 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007366 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007367
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007371 if (*outbytes == NULL) {
7372 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007374 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007376 }
7377 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 const Py_ssize_t n = PyBytes_Size(*outbytes);
7380 if (outsize > PY_SSIZE_T_MAX - n) {
7381 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007385 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7386 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007388 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390 }
7391
7392 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 outsize = WideCharToMultiByte(code_page, flags,
7394 p, size,
7395 out, outsize,
7396 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007397 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 if (outsize <= 0)
7399 goto error;
7400 if (pusedDefaultChar && *pusedDefaultChar)
7401 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007402 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007403
Victor Stinner3a50e702011-10-18 21:21:00 +02007404error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7407 return -2;
7408 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007409 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007410}
7411
Victor Stinner3a50e702011-10-18 21:21:00 +02007412/*
7413 * Encode a Unicode string to a Windows code page into a byte string using a
7414 * error handler.
7415 *
7416 * Returns consumed characters if succeed, or raise a WindowsError and returns
7417 * -1 on other error.
7418 */
7419static int
7420encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007421 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007422 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007423{
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 Py_ssize_t pos = unicode_offset;
7426 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 /* Ideally, we should get reason from FormatMessage. This is the Windows
7428 2000 English version of the message. */
7429 const char *reason = "invalid character";
7430 /* 4=maximum length of a UTF-8 sequence */
7431 char buffer[4];
7432 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7433 Py_ssize_t outsize;
7434 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 PyObject *errorHandler = NULL;
7436 PyObject *exc = NULL;
7437 PyObject *encoding_obj = NULL;
7438 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 PyObject *rep;
7441 int ret = -1;
7442
7443 assert(insize > 0);
7444
7445 encoding = code_page_name(code_page, &encoding_obj);
7446 if (encoding == NULL)
7447 return -1;
7448
7449 if (errors == NULL || strcmp(errors, "strict") == 0) {
7450 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7451 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007452 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 if (exc != NULL) {
7454 PyCodec_StrictErrors(exc);
7455 Py_DECREF(exc);
7456 }
7457 Py_XDECREF(encoding_obj);
7458 return -1;
7459 }
7460
7461 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7462 pusedDefaultChar = &usedDefaultChar;
7463 else
7464 pusedDefaultChar = NULL;
7465
7466 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7467 PyErr_NoMemory();
7468 goto error;
7469 }
7470 outsize = insize * Py_ARRAY_LENGTH(buffer);
7471
7472 if (*outbytes == NULL) {
7473 /* Create string object */
7474 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7475 if (*outbytes == NULL)
7476 goto error;
7477 out = PyBytes_AS_STRING(*outbytes);
7478 }
7479 else {
7480 /* Extend string object */
7481 Py_ssize_t n = PyBytes_Size(*outbytes);
7482 if (n > PY_SSIZE_T_MAX - outsize) {
7483 PyErr_NoMemory();
7484 goto error;
7485 }
7486 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7487 goto error;
7488 out = PyBytes_AS_STRING(*outbytes) + n;
7489 }
7490
7491 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7495 wchar_t chars[2];
7496 int charsize;
7497 if (ch < 0x10000) {
7498 chars[0] = (wchar_t)ch;
7499 charsize = 1;
7500 }
7501 else {
7502 ch -= 0x10000;
7503 chars[0] = 0xd800 + (ch >> 10);
7504 chars[1] = 0xdc00 + (ch & 0x3ff);
7505 charsize = 2;
7506 }
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007509 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 buffer, Py_ARRAY_LENGTH(buffer),
7511 NULL, pusedDefaultChar);
7512 if (outsize > 0) {
7513 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7514 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007515 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 memcpy(out, buffer, outsize);
7517 out += outsize;
7518 continue;
7519 }
7520 }
7521 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7522 PyErr_SetFromWindowsErr(0);
7523 goto error;
7524 }
7525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 rep = unicode_encode_call_errorhandler(
7527 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 if (rep == NULL)
7531 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007532 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533
7534 if (PyBytes_Check(rep)) {
7535 outsize = PyBytes_GET_SIZE(rep);
7536 if (outsize != 1) {
7537 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7538 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7539 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7540 Py_DECREF(rep);
7541 goto error;
7542 }
7543 out = PyBytes_AS_STRING(*outbytes) + offset;
7544 }
7545 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7546 out += outsize;
7547 }
7548 else {
7549 Py_ssize_t i;
7550 enum PyUnicode_Kind kind;
7551 void *data;
7552
Benjamin Petersonbac79492012-01-14 13:34:47 -05007553 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 Py_DECREF(rep);
7555 goto error;
7556 }
7557
7558 outsize = PyUnicode_GET_LENGTH(rep);
7559 if (outsize != 1) {
7560 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7561 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7562 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7563 Py_DECREF(rep);
7564 goto error;
7565 }
7566 out = PyBytes_AS_STRING(*outbytes) + offset;
7567 }
7568 kind = PyUnicode_KIND(rep);
7569 data = PyUnicode_DATA(rep);
7570 for (i=0; i < outsize; i++) {
7571 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7572 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007573 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007574 encoding, unicode,
7575 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 "unable to encode error handler result to ASCII");
7577 Py_DECREF(rep);
7578 goto error;
7579 }
7580 *out = (unsigned char)ch;
7581 out++;
7582 }
7583 }
7584 Py_DECREF(rep);
7585 }
7586 /* write a NUL byte */
7587 *out = 0;
7588 outsize = out - PyBytes_AS_STRING(*outbytes);
7589 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7590 if (_PyBytes_Resize(outbytes, outsize) < 0)
7591 goto error;
7592 ret = 0;
7593
7594error:
7595 Py_XDECREF(encoding_obj);
7596 Py_XDECREF(errorHandler);
7597 Py_XDECREF(exc);
7598 return ret;
7599}
7600
Victor Stinner3a50e702011-10-18 21:21:00 +02007601static PyObject *
7602encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007603 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 const char *errors)
7605{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007608 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007609 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007610
Benjamin Petersonbac79492012-01-14 13:34:47 -05007611 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007612 return NULL;
7613 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007614
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 if (code_page < 0) {
7616 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7617 return NULL;
7618 }
7619
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007621 return PyBytes_FromStringAndSize(NULL, 0);
7622
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 offset = 0;
7624 do
7625 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007627 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 chunks. */
7629 if (len > INT_MAX/2) {
7630 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007631 done = 0;
7632 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007633 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007635 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007636 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007637 done = 1;
7638 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007639
Victor Stinner76a31a62011-11-04 00:05:13 +01007640 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007641 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 errors);
7643 if (ret == -2)
7644 ret = encode_code_page_errors(code_page, &outbytes,
7645 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007647 if (ret < 0) {
7648 Py_XDECREF(outbytes);
7649 return NULL;
7650 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007651
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007654 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 return outbytes;
7657}
7658
7659PyObject *
7660PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7661 Py_ssize_t size,
7662 const char *errors)
7663{
Victor Stinner7581cef2011-11-03 22:32:33 +01007664 PyObject *unicode, *res;
7665 unicode = PyUnicode_FromUnicode(p, size);
7666 if (unicode == NULL)
7667 return NULL;
7668 res = encode_code_page(CP_ACP, unicode, errors);
7669 Py_DECREF(unicode);
7670 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007671}
7672
7673PyObject *
7674PyUnicode_EncodeCodePage(int code_page,
7675 PyObject *unicode,
7676 const char *errors)
7677{
Victor Stinner7581cef2011-11-03 22:32:33 +01007678 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007679}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007680
Alexander Belopolsky40018472011-02-26 01:02:56 +00007681PyObject *
7682PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007683{
7684 if (!PyUnicode_Check(unicode)) {
7685 PyErr_BadArgument();
7686 return NULL;
7687 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007688 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007689}
7690
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007691#undef NEED_RETRY
7692
Victor Stinner99b95382011-07-04 14:23:54 +02007693#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007694
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695/* --- Character Mapping Codec -------------------------------------------- */
7696
Alexander Belopolsky40018472011-02-26 01:02:56 +00007697PyObject *
7698PyUnicode_DecodeCharmap(const char *s,
7699 Py_ssize_t size,
7700 PyObject *mapping,
7701 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007704 Py_ssize_t startinpos;
7705 Py_ssize_t endinpos;
7706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007708 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007709 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 PyObject *errorHandler = NULL;
7711 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007712
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 /* Default to Latin-1 */
7714 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007717 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007721 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007722 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007724 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007725 Py_ssize_t maplen;
7726 enum PyUnicode_Kind kind;
7727 void *data;
7728 Py_UCS4 x;
7729
Benjamin Petersonbac79492012-01-14 13:34:47 -05007730 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007731 return NULL;
7732
7733 maplen = PyUnicode_GET_LENGTH(mapping);
7734 data = PyUnicode_DATA(mapping);
7735 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 while (s < e) {
7737 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007740 x = PyUnicode_READ(kind, data, ch);
7741 else
7742 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007744 if (x == 0xfffe)
7745 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 startinpos = s-starts;
7748 endinpos = startinpos+1;
7749 if (unicode_decode_call_errorhandler(
7750 errors, &errorHandler,
7751 "charmap", "character maps to <undefined>",
7752 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007753 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 goto onError;
7755 }
7756 continue;
7757 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007758
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007759 if (unicode_putchar(&v, &outpos, x) < 0)
7760 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007762 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007763 }
7764 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 while (s < e) {
7766 unsigned char ch = *s;
7767 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7770 w = PyLong_FromLong((long)ch);
7771 if (w == NULL)
7772 goto onError;
7773 x = PyObject_GetItem(mapping, w);
7774 Py_DECREF(w);
7775 if (x == NULL) {
7776 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7777 /* No mapping found means: mapping is undefined. */
7778 PyErr_Clear();
7779 x = Py_None;
7780 Py_INCREF(x);
7781 } else
7782 goto onError;
7783 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 /* Apply mapping */
7786 if (PyLong_Check(x)) {
7787 long value = PyLong_AS_LONG(x);
7788 if (value < 0 || value > 65535) {
7789 PyErr_SetString(PyExc_TypeError,
7790 "character mapping must be in range(65536)");
7791 Py_DECREF(x);
7792 goto onError;
7793 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007794 if (unicode_putchar(&v, &outpos, value) < 0)
7795 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 }
7797 else if (x == Py_None) {
7798 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 startinpos = s-starts;
7800 endinpos = startinpos+1;
7801 if (unicode_decode_call_errorhandler(
7802 errors, &errorHandler,
7803 "charmap", "character maps to <undefined>",
7804 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007805 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 Py_DECREF(x);
7807 goto onError;
7808 }
7809 Py_DECREF(x);
7810 continue;
7811 }
7812 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007813 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814
Benjamin Petersonbac79492012-01-14 13:34:47 -05007815 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007816 goto onError;
7817 targetsize = PyUnicode_GET_LENGTH(x);
7818
7819 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007821 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007822 PyUnicode_READ_CHAR(x, 0)) < 0)
7823 goto onError;
7824 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 else if (targetsize > 1) {
7826 /* 1-n mapping */
7827 if (targetsize > extrachars) {
7828 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 Py_ssize_t needed = (targetsize - extrachars) + \
7830 (targetsize << 2);
7831 extrachars += needed;
7832 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007833 if (unicode_resize(&v,
7834 PyUnicode_GET_LENGTH(v) + needed) < 0)
7835 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 Py_DECREF(x);
7837 goto onError;
7838 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007840 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7841 goto onError;
7842 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7843 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 extrachars -= targetsize;
7845 }
7846 /* 1-0 mapping: skip the character */
7847 }
7848 else {
7849 /* wrong return value */
7850 PyErr_SetString(PyExc_TypeError,
7851 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007852 Py_DECREF(x);
7853 goto onError;
7854 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 Py_DECREF(x);
7856 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007859 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007860 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007861 Py_XDECREF(errorHandler);
7862 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007863 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007864
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 Py_XDECREF(errorHandler);
7867 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 Py_XDECREF(v);
7869 return NULL;
7870}
7871
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007872/* Charmap encoding: the lookup table */
7873
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 PyObject_HEAD
7876 unsigned char level1[32];
7877 int count2, count3;
7878 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879};
7880
7881static PyObject*
7882encoding_map_size(PyObject *obj, PyObject* args)
7883{
7884 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007885 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887}
7888
7889static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 PyDoc_STR("Return the size (in bytes) of this object") },
7892 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893};
7894
7895static void
7896encoding_map_dealloc(PyObject* o)
7897{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007898 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899}
7900
7901static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007902 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 "EncodingMap", /*tp_name*/
7904 sizeof(struct encoding_map), /*tp_basicsize*/
7905 0, /*tp_itemsize*/
7906 /* methods */
7907 encoding_map_dealloc, /*tp_dealloc*/
7908 0, /*tp_print*/
7909 0, /*tp_getattr*/
7910 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007911 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 0, /*tp_repr*/
7913 0, /*tp_as_number*/
7914 0, /*tp_as_sequence*/
7915 0, /*tp_as_mapping*/
7916 0, /*tp_hash*/
7917 0, /*tp_call*/
7918 0, /*tp_str*/
7919 0, /*tp_getattro*/
7920 0, /*tp_setattro*/
7921 0, /*tp_as_buffer*/
7922 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7923 0, /*tp_doc*/
7924 0, /*tp_traverse*/
7925 0, /*tp_clear*/
7926 0, /*tp_richcompare*/
7927 0, /*tp_weaklistoffset*/
7928 0, /*tp_iter*/
7929 0, /*tp_iternext*/
7930 encoding_map_methods, /*tp_methods*/
7931 0, /*tp_members*/
7932 0, /*tp_getset*/
7933 0, /*tp_base*/
7934 0, /*tp_dict*/
7935 0, /*tp_descr_get*/
7936 0, /*tp_descr_set*/
7937 0, /*tp_dictoffset*/
7938 0, /*tp_init*/
7939 0, /*tp_alloc*/
7940 0, /*tp_new*/
7941 0, /*tp_free*/
7942 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943};
7944
7945PyObject*
7946PyUnicode_BuildEncodingMap(PyObject* string)
7947{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948 PyObject *result;
7949 struct encoding_map *mresult;
7950 int i;
7951 int need_dict = 0;
7952 unsigned char level1[32];
7953 unsigned char level2[512];
7954 unsigned char *mlevel1, *mlevel2, *mlevel3;
7955 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 int kind;
7957 void *data;
7958 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007960 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961 PyErr_BadArgument();
7962 return NULL;
7963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 kind = PyUnicode_KIND(string);
7965 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007966 memset(level1, 0xFF, sizeof level1);
7967 memset(level2, 0xFF, sizeof level2);
7968
7969 /* If there isn't a one-to-one mapping of NULL to \0,
7970 or if there are non-BMP characters, we need to use
7971 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973 need_dict = 1;
7974 for (i = 1; i < 256; i++) {
7975 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 ch = PyUnicode_READ(kind, data, i);
7977 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007978 need_dict = 1;
7979 break;
7980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007982 /* unmapped character */
7983 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 l1 = ch >> 11;
7985 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007986 if (level1[l1] == 0xFF)
7987 level1[l1] = count2++;
7988 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007989 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990 }
7991
7992 if (count2 >= 0xFF || count3 >= 0xFF)
7993 need_dict = 1;
7994
7995 if (need_dict) {
7996 PyObject *result = PyDict_New();
7997 PyObject *key, *value;
7998 if (!result)
7999 return NULL;
8000 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008002 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003 if (!key || !value)
8004 goto failed1;
8005 if (PyDict_SetItem(result, key, value) == -1)
8006 goto failed1;
8007 Py_DECREF(key);
8008 Py_DECREF(value);
8009 }
8010 return result;
8011 failed1:
8012 Py_XDECREF(key);
8013 Py_XDECREF(value);
8014 Py_DECREF(result);
8015 return NULL;
8016 }
8017
8018 /* Create a three-level trie */
8019 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8020 16*count2 + 128*count3 - 1);
8021 if (!result)
8022 return PyErr_NoMemory();
8023 PyObject_Init(result, &EncodingMapType);
8024 mresult = (struct encoding_map*)result;
8025 mresult->count2 = count2;
8026 mresult->count3 = count3;
8027 mlevel1 = mresult->level1;
8028 mlevel2 = mresult->level23;
8029 mlevel3 = mresult->level23 + 16*count2;
8030 memcpy(mlevel1, level1, 32);
8031 memset(mlevel2, 0xFF, 16*count2);
8032 memset(mlevel3, 0, 128*count3);
8033 count3 = 0;
8034 for (i = 1; i < 256; i++) {
8035 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037 /* unmapped character */
8038 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008039 o1 = PyUnicode_READ(kind, data, i)>>11;
8040 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 i2 = 16*mlevel1[o1] + o2;
8042 if (mlevel2[i2] == 0xFF)
8043 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 i3 = 128*mlevel2[i2] + o3;
8046 mlevel3[i3] = i;
8047 }
8048 return result;
8049}
8050
8051static int
Victor Stinner22168992011-11-20 17:09:18 +01008052encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053{
8054 struct encoding_map *map = (struct encoding_map*)mapping;
8055 int l1 = c>>11;
8056 int l2 = (c>>7) & 0xF;
8057 int l3 = c & 0x7F;
8058 int i;
8059
Victor Stinner22168992011-11-20 17:09:18 +01008060 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 if (c == 0)
8063 return 0;
8064 /* level 1*/
8065 i = map->level1[l1];
8066 if (i == 0xFF) {
8067 return -1;
8068 }
8069 /* level 2*/
8070 i = map->level23[16*i+l2];
8071 if (i == 0xFF) {
8072 return -1;
8073 }
8074 /* level 3 */
8075 i = map->level23[16*map->count2 + 128*i + l3];
8076 if (i == 0) {
8077 return -1;
8078 }
8079 return i;
8080}
8081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082/* Lookup the character ch in the mapping. If the character
8083 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008084 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008085static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008086charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087{
Christian Heimes217cfd12007-12-02 14:31:20 +00008088 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 PyObject *x;
8090
8091 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 x = PyObject_GetItem(mapping, w);
8094 Py_DECREF(w);
8095 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8097 /* No mapping found means: mapping is undefined. */
8098 PyErr_Clear();
8099 x = Py_None;
8100 Py_INCREF(x);
8101 return x;
8102 } else
8103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008105 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008107 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 long value = PyLong_AS_LONG(x);
8109 if (value < 0 || value > 255) {
8110 PyErr_SetString(PyExc_TypeError,
8111 "character mapping must be in range(256)");
8112 Py_DECREF(x);
8113 return NULL;
8114 }
8115 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008117 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 /* wrong return value */
8121 PyErr_Format(PyExc_TypeError,
8122 "character mapping must return integer, bytes or None, not %.400s",
8123 x->ob_type->tp_name);
8124 Py_DECREF(x);
8125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 }
8127}
8128
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008130charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8133 /* exponentially overallocate to minimize reallocations */
8134 if (requiredsize < 2*outsize)
8135 requiredsize = 2*outsize;
8136 if (_PyBytes_Resize(outobj, requiredsize))
8137 return -1;
8138 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139}
8140
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008145 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 space is available. Return a new reference to the object that
8147 was put in the output buffer, or Py_None, if the mapping was undefined
8148 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008149 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008150static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008151charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008152 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 PyObject *rep;
8155 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008156 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157
Christian Heimes90aa7642007-12-19 02:45:37 +00008158 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (res == -1)
8162 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 if (outsize<requiredsize)
8164 if (charmapencode_resize(outobj, outpos, requiredsize))
8165 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008166 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 outstart[(*outpos)++] = (char)res;
8168 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 }
8170
8171 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 Py_DECREF(rep);
8176 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 if (PyLong_Check(rep)) {
8179 Py_ssize_t requiredsize = *outpos+1;
8180 if (outsize<requiredsize)
8181 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8182 Py_DECREF(rep);
8183 return enc_EXCEPTION;
8184 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008185 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 else {
8189 const char *repchars = PyBytes_AS_STRING(rep);
8190 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8191 Py_ssize_t requiredsize = *outpos+repsize;
8192 if (outsize<requiredsize)
8193 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8194 Py_DECREF(rep);
8195 return enc_EXCEPTION;
8196 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008197 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 memcpy(outstart + *outpos, repchars, repsize);
8199 *outpos += repsize;
8200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 Py_DECREF(rep);
8203 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204}
8205
8206/* handle an error in PyUnicode_EncodeCharmap
8207 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208static int
8209charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008210 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008212 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008213 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214{
8215 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008217 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008218 enum PyUnicode_Kind kind;
8219 void *data;
8220 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008222 Py_ssize_t collstartpos = *inpos;
8223 Py_ssize_t collendpos = *inpos+1;
8224 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 char *encoding = "charmap";
8226 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008228 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008229 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230
Benjamin Petersonbac79492012-01-14 13:34:47 -05008231 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232 return -1;
8233 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 /* find all unencodable characters */
8235 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008237 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008239 val = encoding_map_lookup(ch, mapping);
8240 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 break;
8242 ++collendpos;
8243 continue;
8244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008246 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8247 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 if (rep==NULL)
8249 return -1;
8250 else if (rep!=Py_None) {
8251 Py_DECREF(rep);
8252 break;
8253 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 }
8257 /* cache callback name lookup
8258 * (if not done yet, i.e. it's the first error) */
8259 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 if ((errors==NULL) || (!strcmp(errors, "strict")))
8261 *known_errorHandler = 1;
8262 else if (!strcmp(errors, "replace"))
8263 *known_errorHandler = 2;
8264 else if (!strcmp(errors, "ignore"))
8265 *known_errorHandler = 3;
8266 else if (!strcmp(errors, "xmlcharrefreplace"))
8267 *known_errorHandler = 4;
8268 else
8269 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 }
8271 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008273 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 return -1;
8275 case 2: /* replace */
8276 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 x = charmapencode_output('?', mapping, res, respos);
8278 if (x==enc_EXCEPTION) {
8279 return -1;
8280 }
8281 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008282 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 return -1;
8284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008285 }
8286 /* fall through */
8287 case 3: /* ignore */
8288 *inpos = collendpos;
8289 break;
8290 case 4: /* xmlcharrefreplace */
8291 /* generate replacement (temporarily (mis)uses p) */
8292 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 char buffer[2+29+1+1];
8294 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 for (cp = buffer; *cp; ++cp) {
8297 x = charmapencode_output(*cp, mapping, res, respos);
8298 if (x==enc_EXCEPTION)
8299 return -1;
8300 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008301 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 return -1;
8303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 }
8305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 *inpos = collendpos;
8307 break;
8308 default:
8309 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008312 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008314 if (PyBytes_Check(repunicode)) {
8315 /* Directly copy bytes result to output. */
8316 Py_ssize_t outsize = PyBytes_Size(*res);
8317 Py_ssize_t requiredsize;
8318 repsize = PyBytes_Size(repunicode);
8319 requiredsize = *respos + repsize;
8320 if (requiredsize > outsize)
8321 /* Make room for all additional bytes. */
8322 if (charmapencode_resize(res, respos, requiredsize)) {
8323 Py_DECREF(repunicode);
8324 return -1;
8325 }
8326 memcpy(PyBytes_AsString(*res) + *respos,
8327 PyBytes_AsString(repunicode), repsize);
8328 *respos += repsize;
8329 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008330 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008331 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008333 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008334 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008335 Py_DECREF(repunicode);
8336 return -1;
8337 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008338 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008339 data = PyUnicode_DATA(repunicode);
8340 kind = PyUnicode_KIND(repunicode);
8341 for (index = 0; index < repsize; index++) {
8342 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8343 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008345 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return -1;
8347 }
8348 else if (x==enc_FAILED) {
8349 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008350 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return -1;
8352 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008353 }
8354 *inpos = newpos;
8355 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356 }
8357 return 0;
8358}
8359
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008361_PyUnicode_EncodeCharmap(PyObject *unicode,
8362 PyObject *mapping,
8363 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 /* output object */
8366 PyObject *res = NULL;
8367 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008368 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008369 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008371 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 PyObject *errorHandler = NULL;
8373 PyObject *exc = NULL;
8374 /* the following variable is used for caching string comparisons
8375 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8376 * 3=ignore, 4=xmlcharrefreplace */
8377 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378
Benjamin Petersonbac79492012-01-14 13:34:47 -05008379 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008380 return NULL;
8381 size = PyUnicode_GET_LENGTH(unicode);
8382
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 /* Default to Latin-1 */
8384 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 /* allocate enough for a simple encoding without
8388 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008389 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (res == NULL)
8391 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008392 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008396 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 if (x==enc_EXCEPTION) /* error */
8400 goto onError;
8401 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 &exc,
8404 &known_errorHandler, &errorHandler, errors,
8405 &res, &respos)) {
8406 goto onError;
8407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 else
8410 /* done with this character => adjust input position */
8411 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008415 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008416 if (_PyBytes_Resize(&res, respos) < 0)
8417 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 Py_XDECREF(exc);
8420 Py_XDECREF(errorHandler);
8421 return res;
8422
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 Py_XDECREF(res);
8425 Py_XDECREF(exc);
8426 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 return NULL;
8428}
8429
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008430/* Deprecated */
8431PyObject *
8432PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8433 Py_ssize_t size,
8434 PyObject *mapping,
8435 const char *errors)
8436{
8437 PyObject *result;
8438 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8439 if (unicode == NULL)
8440 return NULL;
8441 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8442 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008443 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008444}
8445
Alexander Belopolsky40018472011-02-26 01:02:56 +00008446PyObject *
8447PyUnicode_AsCharmapString(PyObject *unicode,
8448 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449{
8450 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 PyErr_BadArgument();
8452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008454 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455}
8456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008458static void
8459make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461 Py_ssize_t startpos, Py_ssize_t endpos,
8462 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 *exceptionObject = _PyUnicodeTranslateError_Create(
8466 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 }
8468 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8470 goto onError;
8471 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8472 goto onError;
8473 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8474 goto onError;
8475 return;
8476 onError:
8477 Py_DECREF(*exceptionObject);
8478 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 }
8480}
8481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008483static void
8484raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486 Py_ssize_t startpos, Py_ssize_t endpos,
8487 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488{
8489 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493}
8494
8495/* error handling callback helper:
8496 build arguments, call the callback and check the arguments,
8497 put the result into newpos and return the replacement string, which
8498 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008499static PyObject *
8500unicode_translate_call_errorhandler(const char *errors,
8501 PyObject **errorHandler,
8502 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008504 Py_ssize_t startpos, Py_ssize_t endpos,
8505 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008507 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008509 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 PyObject *restuple;
8511 PyObject *resunicode;
8512
8513 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
8518
8519 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523
8524 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008529 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 Py_DECREF(restuple);
8531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 }
8533 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 &resunicode, &i_newpos)) {
8535 Py_DECREF(restuple);
8536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 else
8541 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8544 Py_DECREF(restuple);
8545 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 Py_INCREF(resunicode);
8548 Py_DECREF(restuple);
8549 return resunicode;
8550}
8551
8552/* Lookup the character ch in the mapping and put the result in result,
8553 which must be decrefed by the caller.
8554 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008555static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557{
Christian Heimes217cfd12007-12-02 14:31:20 +00008558 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 PyObject *x;
8560
8561 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 x = PyObject_GetItem(mapping, w);
8564 Py_DECREF(w);
8565 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8567 /* No mapping found means: use 1:1 mapping. */
8568 PyErr_Clear();
8569 *result = NULL;
8570 return 0;
8571 } else
8572 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 }
8574 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 *result = x;
8576 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008578 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 long value = PyLong_AS_LONG(x);
8580 long max = PyUnicode_GetMax();
8581 if (value < 0 || value > max) {
8582 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008583 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 Py_DECREF(x);
8585 return -1;
8586 }
8587 *result = x;
8588 return 0;
8589 }
8590 else if (PyUnicode_Check(x)) {
8591 *result = x;
8592 return 0;
8593 }
8594 else {
8595 /* wrong return value */
8596 PyErr_SetString(PyExc_TypeError,
8597 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008598 Py_DECREF(x);
8599 return -1;
8600 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601}
8602/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 if not reallocate and adjust various state variables.
8604 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008610 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 /* exponentially overallocate to minimize reallocations */
8612 if (requiredsize < 2 * oldsize)
8613 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8615 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 }
8619 return 0;
8620}
8621/* lookup the character, put the result in the output string and adjust
8622 various state variables. Return a new reference to the object that
8623 was put in the output buffer in *result, or Py_None, if the mapping was
8624 undefined (in which case no character was written).
8625 The called must decref result.
8626 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8629 PyObject *mapping, Py_UCS4 **output,
8630 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8634 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 }
8640 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008642 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 }
8646 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 Py_ssize_t repsize;
8648 if (PyUnicode_READY(*res) == -1)
8649 return -1;
8650 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 if (repsize==1) {
8652 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 }
8655 else if (repsize!=0) {
8656 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 Py_ssize_t requiredsize = *opos +
8658 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 Py_ssize_t i;
8661 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 for(i = 0; i < repsize; i++)
8664 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 }
8667 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 return 0;
8670}
8671
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673_PyUnicode_TranslateCharmap(PyObject *input,
8674 PyObject *mapping,
8675 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 /* input object */
8678 char *idata;
8679 Py_ssize_t size, i;
8680 int kind;
8681 /* output buffer */
8682 Py_UCS4 *output = NULL;
8683 Py_ssize_t osize;
8684 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 char *reason = "character maps to <undefined>";
8688 PyObject *errorHandler = NULL;
8689 PyObject *exc = NULL;
8690 /* the following variable is used for caching string comparisons
8691 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8692 * 3=ignore, 4=xmlcharrefreplace */
8693 int known_errorHandler = -1;
8694
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 PyErr_BadArgument();
8697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 if (PyUnicode_READY(input) == -1)
8701 return NULL;
8702 idata = (char*)PyUnicode_DATA(input);
8703 kind = PyUnicode_KIND(input);
8704 size = PyUnicode_GET_LENGTH(input);
8705 i = 0;
8706
8707 if (size == 0) {
8708 Py_INCREF(input);
8709 return input;
8710 }
8711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 /* allocate enough for a simple 1:1 translation without
8713 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 osize = size;
8715 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8716 opos = 0;
8717 if (output == NULL) {
8718 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 /* try to encode it */
8724 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 if (charmaptranslate_output(input, i, mapping,
8726 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 Py_XDECREF(x);
8728 goto onError;
8729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 else { /* untranslatable character */
8734 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8735 Py_ssize_t repsize;
8736 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 Py_ssize_t collstart = i;
8740 Py_ssize_t collend = i+1;
8741 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 while (collend < size) {
8745 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 goto onError;
8747 Py_XDECREF(x);
8748 if (x!=Py_None)
8749 break;
8750 ++collend;
8751 }
8752 /* cache callback name lookup
8753 * (if not done yet, i.e. it's the first error) */
8754 if (known_errorHandler==-1) {
8755 if ((errors==NULL) || (!strcmp(errors, "strict")))
8756 known_errorHandler = 1;
8757 else if (!strcmp(errors, "replace"))
8758 known_errorHandler = 2;
8759 else if (!strcmp(errors, "ignore"))
8760 known_errorHandler = 3;
8761 else if (!strcmp(errors, "xmlcharrefreplace"))
8762 known_errorHandler = 4;
8763 else
8764 known_errorHandler = 0;
8765 }
8766 switch (known_errorHandler) {
8767 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 raise_translate_exception(&exc, input, collstart,
8769 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 case 2: /* replace */
8772 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 for (coll = collstart; coll<collend; coll++)
8774 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 /* fall through */
8776 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 break;
8779 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 /* generate replacement (temporarily (mis)uses i) */
8781 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 char buffer[2+29+1+1];
8783 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8785 if (charmaptranslate_makespace(&output, &osize,
8786 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 goto onError;
8788 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 break;
8793 default:
8794 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 reason, input, &exc,
8796 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008797 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008799 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008800 Py_DECREF(repunicode);
8801 goto onError;
8802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 repsize = PyUnicode_GET_LENGTH(repunicode);
8805 if (charmaptranslate_makespace(&output, &osize,
8806 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 Py_DECREF(repunicode);
8808 goto onError;
8809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 for (uni2 = 0; repsize-->0; ++uni2)
8811 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8812 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008815 }
8816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8818 if (!res)
8819 goto onError;
8820 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821 Py_XDECREF(exc);
8822 Py_XDECREF(errorHandler);
8823 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827 Py_XDECREF(exc);
8828 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 return NULL;
8830}
8831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832/* Deprecated. Use PyUnicode_Translate instead. */
8833PyObject *
8834PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8835 Py_ssize_t size,
8836 PyObject *mapping,
8837 const char *errors)
8838{
8839 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8840 if (!unicode)
8841 return NULL;
8842 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8843}
8844
Alexander Belopolsky40018472011-02-26 01:02:56 +00008845PyObject *
8846PyUnicode_Translate(PyObject *str,
8847 PyObject *mapping,
8848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
8850 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008851
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 str = PyUnicode_FromObject(str);
8853 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 Py_DECREF(str);
8857 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008858
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 Py_XDECREF(str);
8861 return NULL;
8862}
Tim Petersced69f82003-09-16 20:30:58 +00008863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008865fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866{
8867 /* No need to call PyUnicode_READY(self) because this function is only
8868 called as a callback from fixup() which does it already. */
8869 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8870 const int kind = PyUnicode_KIND(self);
8871 void *data = PyUnicode_DATA(self);
8872 Py_UCS4 maxchar = 0, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008873 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 Py_ssize_t i;
8875
8876 for (i = 0; i < len; ++i) {
8877 ch = PyUnicode_READ(kind, data, i);
8878 fixed = 0;
8879 if (ch > 127) {
8880 if (Py_UNICODE_ISSPACE(ch))
8881 fixed = ' ';
8882 else {
8883 const int decimal = Py_UNICODE_TODECIMAL(ch);
8884 if (decimal >= 0)
8885 fixed = '0' + decimal;
8886 }
8887 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008888 modified = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 if (fixed > maxchar)
8890 maxchar = fixed;
8891 PyUnicode_WRITE(kind, data, i, fixed);
8892 }
8893 else if (ch > maxchar)
8894 maxchar = ch;
8895 }
8896 else if (ch > maxchar)
8897 maxchar = ch;
8898 }
8899
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008900 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901}
8902
8903PyObject *
8904_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8905{
8906 if (!PyUnicode_Check(unicode)) {
8907 PyErr_BadInternalCall();
8908 return NULL;
8909 }
8910 if (PyUnicode_READY(unicode) == -1)
8911 return NULL;
8912 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8913 /* If the string is already ASCII, just return the same string */
8914 Py_INCREF(unicode);
8915 return unicode;
8916 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008917 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918}
8919
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008920PyObject *
8921PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8922 Py_ssize_t length)
8923{
Victor Stinnerf0124502011-11-21 23:12:56 +01008924 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008925 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008926 Py_UCS4 maxchar;
8927 enum PyUnicode_Kind kind;
8928 void *data;
8929
8930 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008931 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008932 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008933 if (ch > 127) {
8934 int decimal = Py_UNICODE_TODECIMAL(ch);
8935 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008936 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008937 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008938 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008939 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008940
8941 /* Copy to a new string */
8942 decimal = PyUnicode_New(length, maxchar);
8943 if (decimal == NULL)
8944 return decimal;
8945 kind = PyUnicode_KIND(decimal);
8946 data = PyUnicode_DATA(decimal);
8947 /* Iterate over code points */
8948 for (i = 0; i < length; i++) {
8949 Py_UNICODE ch = s[i];
8950 if (ch > 127) {
8951 int decimal = Py_UNICODE_TODECIMAL(ch);
8952 if (decimal >= 0)
8953 ch = '0' + decimal;
8954 }
8955 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008957 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008958}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008959/* --- Decimal Encoder ---------------------------------------------------- */
8960
Alexander Belopolsky40018472011-02-26 01:02:56 +00008961int
8962PyUnicode_EncodeDecimal(Py_UNICODE *s,
8963 Py_ssize_t length,
8964 char *output,
8965 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008966{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008967 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008968 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008969 enum PyUnicode_Kind kind;
8970 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008971
8972 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyErr_BadArgument();
8974 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008975 }
8976
Victor Stinner42bf7752011-11-21 22:52:58 +01008977 unicode = PyUnicode_FromUnicode(s, length);
8978 if (unicode == NULL)
8979 return -1;
8980
Benjamin Petersonbac79492012-01-14 13:34:47 -05008981 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008982 Py_DECREF(unicode);
8983 return -1;
8984 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008985 kind = PyUnicode_KIND(unicode);
8986 data = PyUnicode_DATA(unicode);
8987
Victor Stinnerb84d7232011-11-22 01:50:07 +01008988 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008989 PyObject *exc;
8990 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008992 Py_ssize_t startpos;
8993
8994 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008995
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008997 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008998 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009000 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 decimal = Py_UNICODE_TODECIMAL(ch);
9002 if (decimal >= 0) {
9003 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009004 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 continue;
9006 }
9007 if (0 < ch && ch < 256) {
9008 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009009 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 continue;
9011 }
Victor Stinner6345be92011-11-25 20:09:01 +01009012
Victor Stinner42bf7752011-11-21 22:52:58 +01009013 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009014 exc = NULL;
9015 raise_encode_exception(&exc, "decimal", unicode,
9016 startpos, startpos+1,
9017 "invalid decimal Unicode string");
9018 Py_XDECREF(exc);
9019 Py_DECREF(unicode);
9020 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009021 }
9022 /* 0-terminate the output string */
9023 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009024 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009025 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009026}
9027
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028/* --- Helpers ------------------------------------------------------------ */
9029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009031any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 Py_ssize_t start,
9033 Py_ssize_t end)
9034{
9035 int kind1, kind2, kind;
9036 void *buf1, *buf2;
9037 Py_ssize_t len1, len2, result;
9038
9039 kind1 = PyUnicode_KIND(s1);
9040 kind2 = PyUnicode_KIND(s2);
9041 kind = kind1 > kind2 ? kind1 : kind2;
9042 buf1 = PyUnicode_DATA(s1);
9043 buf2 = PyUnicode_DATA(s2);
9044 if (kind1 != kind)
9045 buf1 = _PyUnicode_AsKind(s1, kind);
9046 if (!buf1)
9047 return -2;
9048 if (kind2 != kind)
9049 buf2 = _PyUnicode_AsKind(s2, kind);
9050 if (!buf2) {
9051 if (kind1 != kind) PyMem_Free(buf1);
9052 return -2;
9053 }
9054 len1 = PyUnicode_GET_LENGTH(s1);
9055 len2 = PyUnicode_GET_LENGTH(s2);
9056
Victor Stinner794d5672011-10-10 03:21:36 +02009057 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009058 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009059 case PyUnicode_1BYTE_KIND:
9060 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9061 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9062 else
9063 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9064 break;
9065 case PyUnicode_2BYTE_KIND:
9066 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9067 break;
9068 case PyUnicode_4BYTE_KIND:
9069 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9070 break;
9071 default:
9072 assert(0); result = -2;
9073 }
9074 }
9075 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009076 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009077 case PyUnicode_1BYTE_KIND:
9078 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9079 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9080 else
9081 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082 break;
9083 case PyUnicode_2BYTE_KIND:
9084 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9085 break;
9086 case PyUnicode_4BYTE_KIND:
9087 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9088 break;
9089 default:
9090 assert(0); result = -2;
9091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 }
9093
9094 if (kind1 != kind)
9095 PyMem_Free(buf1);
9096 if (kind2 != kind)
9097 PyMem_Free(buf2);
9098
9099 return result;
9100}
9101
9102Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009103_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 Py_ssize_t n_buffer,
9105 void *digits, Py_ssize_t n_digits,
9106 Py_ssize_t min_width,
9107 const char *grouping,
9108 const char *thousands_sep)
9109{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009110 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009112 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9113 return _PyUnicode_ascii_InsertThousandsGrouping(
9114 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9115 min_width, grouping, thousands_sep);
9116 else
9117 return _PyUnicode_ucs1_InsertThousandsGrouping(
9118 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9119 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 case PyUnicode_2BYTE_KIND:
9121 return _PyUnicode_ucs2_InsertThousandsGrouping(
9122 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9123 min_width, grouping, thousands_sep);
9124 case PyUnicode_4BYTE_KIND:
9125 return _PyUnicode_ucs4_InsertThousandsGrouping(
9126 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9127 min_width, grouping, thousands_sep);
9128 }
9129 assert(0);
9130 return -1;
9131}
9132
9133
/* Helper macro to fix up start/end slice values in place against a
   sequence length len:

     - end greater than len is clamped to len;
     - a negative end or start is taken relative to len, and clamped to 0
       if it is still negative.

   start is not clamped against len.  start and end must be modifiable
   lvalues; len may be evaluated more than once.  The body is wrapped in
   do { } while (0) so that the macro behaves as a single statement (for
   instance under an unbraced if/else); invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009148
Alexander Belopolsky40018472011-02-26 01:02:56 +00009149Py_ssize_t
9150PyUnicode_Count(PyObject *str,
9151 PyObject *substr,
9152 Py_ssize_t start,
9153 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009155 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009156 PyObject* str_obj;
9157 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 int kind1, kind2, kind;
9159 void *buf1 = NULL, *buf2 = NULL;
9160 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009161
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009162 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009163 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009165 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009166 if (!sub_obj) {
9167 Py_DECREF(str_obj);
9168 return -1;
9169 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009170 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009171 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 Py_DECREF(str_obj);
9173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 }
Tim Petersced69f82003-09-16 20:30:58 +00009175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 kind1 = PyUnicode_KIND(str_obj);
9177 kind2 = PyUnicode_KIND(sub_obj);
9178 kind = kind1 > kind2 ? kind1 : kind2;
9179 buf1 = PyUnicode_DATA(str_obj);
9180 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009181 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 if (!buf1)
9183 goto onError;
9184 buf2 = PyUnicode_DATA(sub_obj);
9185 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009186 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (!buf2)
9188 goto onError;
9189 len1 = PyUnicode_GET_LENGTH(str_obj);
9190 len2 = PyUnicode_GET_LENGTH(sub_obj);
9191
9192 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009193 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009195 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9196 result = asciilib_count(
9197 ((Py_UCS1*)buf1) + start, end - start,
9198 buf2, len2, PY_SSIZE_T_MAX
9199 );
9200 else
9201 result = ucs1lib_count(
9202 ((Py_UCS1*)buf1) + start, end - start,
9203 buf2, len2, PY_SSIZE_T_MAX
9204 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 break;
9206 case PyUnicode_2BYTE_KIND:
9207 result = ucs2lib_count(
9208 ((Py_UCS2*)buf1) + start, end - start,
9209 buf2, len2, PY_SSIZE_T_MAX
9210 );
9211 break;
9212 case PyUnicode_4BYTE_KIND:
9213 result = ucs4lib_count(
9214 ((Py_UCS4*)buf1) + start, end - start,
9215 buf2, len2, PY_SSIZE_T_MAX
9216 );
9217 break;
9218 default:
9219 assert(0); result = 0;
9220 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009221
9222 Py_DECREF(sub_obj);
9223 Py_DECREF(str_obj);
9224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 if (kind1 != kind)
9226 PyMem_Free(buf1);
9227 if (kind2 != kind)
9228 PyMem_Free(buf2);
9229
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 onError:
9232 Py_DECREF(sub_obj);
9233 Py_DECREF(str_obj);
9234 if (kind1 != kind && buf1)
9235 PyMem_Free(buf1);
9236 if (kind2 != kind && buf2)
9237 PyMem_Free(buf2);
9238 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239}
9240
Alexander Belopolsky40018472011-02-26 01:02:56 +00009241Py_ssize_t
9242PyUnicode_Find(PyObject *str,
9243 PyObject *sub,
9244 Py_ssize_t start,
9245 Py_ssize_t end,
9246 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009248 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009249
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009251 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009253 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009254 if (!sub) {
9255 Py_DECREF(str);
9256 return -2;
9257 }
9258 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9259 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 Py_DECREF(str);
9261 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 }
Tim Petersced69f82003-09-16 20:30:58 +00009263
Victor Stinner794d5672011-10-10 03:21:36 +02009264 result = any_find_slice(direction,
9265 str, sub, start, end
9266 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009267
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009269 Py_DECREF(sub);
9270
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271 return result;
9272}
9273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274Py_ssize_t
9275PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9276 Py_ssize_t start, Py_ssize_t end,
9277 int direction)
9278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009280 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 if (PyUnicode_READY(str) == -1)
9282 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009283 if (start < 0 || end < 0) {
9284 PyErr_SetString(PyExc_IndexError, "string index out of range");
9285 return -2;
9286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 if (end > PyUnicode_GET_LENGTH(str))
9288 end = PyUnicode_GET_LENGTH(str);
9289 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009290 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9291 kind, end-start, ch, direction);
9292 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009294 else
9295 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296}
9297
Alexander Belopolsky40018472011-02-26 01:02:56 +00009298static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009299tailmatch(PyObject *self,
9300 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009301 Py_ssize_t start,
9302 Py_ssize_t end,
9303 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 int kind_self;
9306 int kind_sub;
9307 void *data_self;
9308 void *data_sub;
9309 Py_ssize_t offset;
9310 Py_ssize_t i;
9311 Py_ssize_t end_sub;
9312
9313 if (PyUnicode_READY(self) == -1 ||
9314 PyUnicode_READY(substring) == -1)
9315 return 0;
9316
9317 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 return 1;
9319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9321 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 kind_self = PyUnicode_KIND(self);
9326 data_self = PyUnicode_DATA(self);
9327 kind_sub = PyUnicode_KIND(substring);
9328 data_sub = PyUnicode_DATA(substring);
9329 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9330
9331 if (direction > 0)
9332 offset = end;
9333 else
9334 offset = start;
9335
9336 if (PyUnicode_READ(kind_self, data_self, offset) ==
9337 PyUnicode_READ(kind_sub, data_sub, 0) &&
9338 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9339 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9340 /* If both are of the same kind, memcmp is sufficient */
9341 if (kind_self == kind_sub) {
9342 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009343 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 data_sub,
9345 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009346 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
9348 /* otherwise we have to compare each character by first accesing it */
9349 else {
9350 /* We do not need to compare 0 and len(substring)-1 because
9351 the if statement above ensured already that they are equal
9352 when we end up here. */
9353 // TODO: honor direction and do a forward or backwards search
9354 for (i = 1; i < end_sub; ++i) {
9355 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9356 PyUnicode_READ(kind_sub, data_sub, i))
9357 return 0;
9358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
9362
9363 return 0;
9364}
9365
Alexander Belopolsky40018472011-02-26 01:02:56 +00009366Py_ssize_t
9367PyUnicode_Tailmatch(PyObject *str,
9368 PyObject *substr,
9369 Py_ssize_t start,
9370 Py_ssize_t end,
9371 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009373 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009374
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 str = PyUnicode_FromObject(str);
9376 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 substr = PyUnicode_FromObject(substr);
9379 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 Py_DECREF(str);
9381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Tim Petersced69f82003-09-16 20:30:58 +00009383
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009384 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 Py_DECREF(str);
9387 Py_DECREF(substr);
9388 return result;
9389}
9390
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391/* Apply fixfct filter to the Unicode object self and return a
9392 reference to the modified object */
9393
Alexander Belopolsky40018472011-02-26 01:02:56 +00009394static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009395fixup(PyObject *self,
9396 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 PyObject *u;
9399 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009400 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009402 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009405 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 /* fix functions return the new maximum character in a string,
9408 if the kind of the resulting unicode object does not change,
9409 everything is fine. Otherwise we need to change the string kind
9410 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009411 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009412
9413 if (maxchar_new == 0) {
9414 /* no changes */;
9415 if (PyUnicode_CheckExact(self)) {
9416 Py_DECREF(u);
9417 Py_INCREF(self);
9418 return self;
9419 }
9420 else
9421 return u;
9422 }
9423
9424 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 maxchar_new = 127;
9426 else if (maxchar_new <= 255)
9427 maxchar_new = 255;
9428 else if (maxchar_new <= 65535)
9429 maxchar_new = 65535;
9430 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009431 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432
Victor Stinnereaab6042011-12-11 22:22:39 +01009433 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009435
9436 /* In case the maximum character changed, we need to
9437 convert the string to the new category. */
9438 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9439 if (v == NULL) {
9440 Py_DECREF(u);
9441 return NULL;
9442 }
9443 if (maxchar_new > maxchar_old) {
9444 /* If the maxchar increased so that the kind changed, not all
9445 characters are representable anymore and we need to fix the
9446 string again. This only happens in very few cases. */
9447 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9448 maxchar_old = fixfct(v);
9449 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 }
9451 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009452 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009454 Py_DECREF(u);
9455 assert(_PyUnicode_CheckConsistency(v, 1));
9456 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009459static PyObject *
9460ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009462 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9463 char *resdata, *data = PyUnicode_DATA(self);
9464 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009465
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466 res = PyUnicode_New(len, 127);
9467 if (res == NULL)
9468 return NULL;
9469 resdata = PyUnicode_DATA(res);
9470 if (lower)
9471 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473 _Py_bytes_upper(resdata, data, len);
9474 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480 Py_ssize_t j;
9481 int final_sigma;
9482 Py_UCS4 c;
9483 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009484
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009485 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9486
9487 where ! is a negation and \p{xxx} is a character with property xxx.
9488 */
9489 for (j = i - 1; j >= 0; j--) {
9490 c = PyUnicode_READ(kind, data, j);
9491 if (!_PyUnicode_IsCaseIgnorable(c))
9492 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009494 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9495 if (final_sigma) {
9496 for (j = i + 1; j < length; j++) {
9497 c = PyUnicode_READ(kind, data, j);
9498 if (!_PyUnicode_IsCaseIgnorable(c))
9499 break;
9500 }
9501 final_sigma = j == length || !_PyUnicode_IsCased(c);
9502 }
9503 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504}
9505
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009506static int
9507lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9508 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009510 /* Obscure special case. */
9511 if (c == 0x3A3) {
9512 mapped[0] = handle_capital_sigma(kind, data, length, i);
9513 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009515 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516}
9517
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009518static Py_ssize_t
9519do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009521 Py_ssize_t i, k = 0;
9522 int n_res, j;
9523 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009524
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009525 c = PyUnicode_READ(kind, data, 0);
9526 n_res = _PyUnicode_ToUpperFull(c, mapped);
9527 for (j = 0; j < n_res; j++) {
9528 if (mapped[j] > *maxchar)
9529 *maxchar = mapped[j];
9530 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009532 for (i = 1; i < length; i++) {
9533 c = PyUnicode_READ(kind, data, i);
9534 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9535 for (j = 0; j < n_res; j++) {
9536 if (mapped[j] > *maxchar)
9537 *maxchar = mapped[j];
9538 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009539 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009540 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009541 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542}
9543
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009544static Py_ssize_t
9545do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9546 Py_ssize_t i, k = 0;
9547
9548 for (i = 0; i < length; i++) {
9549 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9550 int n_res, j;
9551 if (Py_UNICODE_ISUPPER(c)) {
9552 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9553 }
9554 else if (Py_UNICODE_ISLOWER(c)) {
9555 n_res = _PyUnicode_ToUpperFull(c, mapped);
9556 }
9557 else {
9558 n_res = 1;
9559 mapped[0] = c;
9560 }
9561 for (j = 0; j < n_res; j++) {
9562 if (mapped[j] > *maxchar)
9563 *maxchar = mapped[j];
9564 res[k++] = mapped[j];
9565 }
9566 }
9567 return k;
9568}
9569
9570static Py_ssize_t
9571do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9572 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009574 Py_ssize_t i, k = 0;
9575
9576 for (i = 0; i < length; i++) {
9577 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9578 int n_res, j;
9579 if (lower)
9580 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9581 else
9582 n_res = _PyUnicode_ToUpperFull(c, mapped);
9583 for (j = 0; j < n_res; j++) {
9584 if (mapped[j] > *maxchar)
9585 *maxchar = mapped[j];
9586 res[k++] = mapped[j];
9587 }
9588 }
9589 return k;
9590}
9591
9592static Py_ssize_t
9593do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9594{
9595 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9596}
9597
9598static Py_ssize_t
9599do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9600{
9601 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9602}
9603
Benjamin Petersone51757f2012-01-12 21:10:29 -05009604static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009605do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9606{
9607 Py_ssize_t i, k = 0;
9608
9609 for (i = 0; i < length; i++) {
9610 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9611 Py_UCS4 mapped[3];
9612 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9613 for (j = 0; j < n_res; j++) {
9614 if (mapped[j] > *maxchar)
9615 *maxchar = mapped[j];
9616 res[k++] = mapped[j];
9617 }
9618 }
9619 return k;
9620}
9621
9622static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009623do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9624{
9625 Py_ssize_t i, k = 0;
9626 int previous_is_cased;
9627
9628 previous_is_cased = 0;
9629 for (i = 0; i < length; i++) {
9630 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9631 Py_UCS4 mapped[3];
9632 int n_res, j;
9633
9634 if (previous_is_cased)
9635 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9636 else
9637 n_res = _PyUnicode_ToTitleFull(c, mapped);
9638
9639 for (j = 0; j < n_res; j++) {
9640 if (mapped[j] > *maxchar)
9641 *maxchar = mapped[j];
9642 res[k++] = mapped[j];
9643 }
9644
9645 previous_is_cased = _PyUnicode_IsCased(c);
9646 }
9647 return k;
9648}
9649
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650static PyObject *
9651case_operation(PyObject *self,
9652 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9653{
9654 PyObject *res = NULL;
9655 Py_ssize_t length, newlength = 0;
9656 int kind, outkind;
9657 void *data, *outdata;
9658 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9659
Benjamin Petersoneea48462012-01-16 14:28:50 -05009660 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661
9662 kind = PyUnicode_KIND(self);
9663 data = PyUnicode_DATA(self);
9664 length = PyUnicode_GET_LENGTH(self);
9665 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9666 if (tmp == NULL)
9667 return PyErr_NoMemory();
9668 newlength = perform(kind, data, length, tmp, &maxchar);
9669 res = PyUnicode_New(newlength, maxchar);
9670 if (res == NULL)
9671 goto leave;
9672 tmpend = tmp + newlength;
9673 outdata = PyUnicode_DATA(res);
9674 outkind = PyUnicode_KIND(res);
9675 switch (outkind) {
9676 case PyUnicode_1BYTE_KIND:
9677 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9678 break;
9679 case PyUnicode_2BYTE_KIND:
9680 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9681 break;
9682 case PyUnicode_4BYTE_KIND:
9683 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9684 break;
9685 default:
9686 assert(0);
9687 break;
9688 }
9689 leave:
9690 PyMem_FREE(tmp);
9691 return res;
9692}
9693
Tim Peters8ce9f162004-08-27 01:49:32 +00009694PyObject *
9695PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009698 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009700 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009701 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9702 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009703 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009705 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009707 int use_memcpy;
9708 unsigned char *res_data = NULL, *sep_data = NULL;
9709 PyObject *last_obj;
9710 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
Tim Peters05eba1f2004-08-27 21:32:02 +00009712 fseq = PySequence_Fast(seq, "");
9713 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009715 }
9716
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 /* NOTE: the following code can't call back into Python code,
9718 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009719 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009720
Tim Peters05eba1f2004-08-27 21:32:02 +00009721 seqlen = PySequence_Fast_GET_SIZE(fseq);
9722 /* If empty sequence, return u"". */
9723 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009724 Py_DECREF(fseq);
9725 Py_INCREF(unicode_empty);
9726 res = unicode_empty;
9727 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009728 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009729
Tim Peters05eba1f2004-08-27 21:32:02 +00009730 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009731 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009732 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009733 if (seqlen == 1) {
9734 if (PyUnicode_CheckExact(items[0])) {
9735 res = items[0];
9736 Py_INCREF(res);
9737 Py_DECREF(fseq);
9738 return res;
9739 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009740 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009741 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009742 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009744 /* Set up sep and seplen */
9745 if (separator == NULL) {
9746 /* fall back to a blank space separator */
9747 sep = PyUnicode_FromOrdinal(' ');
9748 if (!sep)
9749 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009751 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009752 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009753 else {
9754 if (!PyUnicode_Check(separator)) {
9755 PyErr_Format(PyExc_TypeError,
9756 "separator: expected str instance,"
9757 " %.80s found",
9758 Py_TYPE(separator)->tp_name);
9759 goto onError;
9760 }
9761 if (PyUnicode_READY(separator))
9762 goto onError;
9763 sep = separator;
9764 seplen = PyUnicode_GET_LENGTH(separator);
9765 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9766 /* inc refcount to keep this code path symmetric with the
9767 above case of a blank separator */
9768 Py_INCREF(sep);
9769 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009770 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009771 }
9772
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009773 /* There are at least two things to join, or else we have a subclass
9774 * of str in the sequence.
9775 * Do a pre-pass to figure out the total amount of space we'll
9776 * need (sz), and see whether all argument are strings.
9777 */
9778 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009779#ifdef Py_DEBUG
9780 use_memcpy = 0;
9781#else
9782 use_memcpy = 1;
9783#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009784 for (i = 0; i < seqlen; i++) {
9785 const Py_ssize_t old_sz = sz;
9786 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 if (!PyUnicode_Check(item)) {
9788 PyErr_Format(PyExc_TypeError,
9789 "sequence item %zd: expected str instance,"
9790 " %.80s found",
9791 i, Py_TYPE(item)->tp_name);
9792 goto onError;
9793 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (PyUnicode_READY(item) == -1)
9795 goto onError;
9796 sz += PyUnicode_GET_LENGTH(item);
9797 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009798 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009799 if (i != 0)
9800 sz += seplen;
9801 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9802 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009803 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009804 goto onError;
9805 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009806 if (use_memcpy && last_obj != NULL) {
9807 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9808 use_memcpy = 0;
9809 }
9810 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009811 }
Tim Petersced69f82003-09-16 20:30:58 +00009812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009814 if (res == NULL)
9815 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009816
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009817 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009818#ifdef Py_DEBUG
9819 use_memcpy = 0;
9820#else
9821 if (use_memcpy) {
9822 res_data = PyUnicode_1BYTE_DATA(res);
9823 kind = PyUnicode_KIND(res);
9824 if (seplen != 0)
9825 sep_data = PyUnicode_1BYTE_DATA(sep);
9826 }
9827#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009829 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009830 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009832 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009833 if (use_memcpy) {
9834 Py_MEMCPY(res_data,
9835 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009836 kind * seplen);
9837 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009838 }
9839 else {
9840 copy_characters(res, res_offset, sep, 0, seplen);
9841 res_offset += seplen;
9842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009843 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009844 itemlen = PyUnicode_GET_LENGTH(item);
9845 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009846 if (use_memcpy) {
9847 Py_MEMCPY(res_data,
9848 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009849 kind * itemlen);
9850 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009851 }
9852 else {
9853 copy_characters(res, res_offset, item, 0, itemlen);
9854 res_offset += itemlen;
9855 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009856 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009857 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009858 if (use_memcpy)
9859 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009860 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009861 else
9862 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009863
Tim Peters05eba1f2004-08-27 21:32:02 +00009864 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009866 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
Benjamin Peterson29060642009-01-31 22:14:21 +00009869 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009872 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 return NULL;
9874}
9875
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width of the buffer (1, 2 or 4 bytes per character).

   `value` and `length` are evaluated more than once in the 2- and 4-byte
   cases, so pass side-effect-free expressions. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9899
Victor Stinner3fe55312012-01-04 00:33:50 +01009900Py_ssize_t
9901PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9902 Py_UCS4 fill_char)
9903{
9904 Py_ssize_t maxlen;
9905 enum PyUnicode_Kind kind;
9906 void *data;
9907
9908 if (!PyUnicode_Check(unicode)) {
9909 PyErr_BadInternalCall();
9910 return -1;
9911 }
9912 if (PyUnicode_READY(unicode) == -1)
9913 return -1;
9914 if (unicode_check_modifiable(unicode))
9915 return -1;
9916
9917 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9918 PyErr_SetString(PyExc_ValueError,
9919 "fill character is bigger than "
9920 "the string maximum character");
9921 return -1;
9922 }
9923
9924 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9925 length = Py_MIN(maxlen, length);
9926 if (length <= 0)
9927 return 0;
9928
9929 kind = PyUnicode_KIND(unicode);
9930 data = PyUnicode_DATA(unicode);
9931 FILL(kind, data, fill_char, start, length);
9932 return length;
9933}
9934
Victor Stinner9310abb2011-10-05 00:59:23 +02009935static PyObject *
9936pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009937 Py_ssize_t left,
9938 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 PyObject *u;
9942 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009943 int kind;
9944 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
9946 if (left < 0)
9947 left = 0;
9948 if (right < 0)
9949 right = 0;
9950
Victor Stinnerc4b49542011-12-11 22:44:26 +01009951 if (left == 0 && right == 0)
9952 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9955 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009956 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9957 return NULL;
9958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9960 if (fill > maxchar)
9961 maxchar = fill;
9962 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009963 if (!u)
9964 return NULL;
9965
9966 kind = PyUnicode_KIND(u);
9967 data = PyUnicode_DATA(u);
9968 if (left)
9969 FILL(kind, data, fill, 0, left);
9970 if (right)
9971 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009972 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009973 assert(_PyUnicode_CheckConsistency(u, 1));
9974 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
Alexander Belopolsky40018472011-02-26 01:02:56 +00009978PyObject *
9979PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982
9983 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009984 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009986 if (PyUnicode_READY(string) == -1) {
9987 Py_DECREF(string);
9988 return NULL;
9989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990
Benjamin Petersonead6b532011-12-20 17:23:42 -06009991 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009993 if (PyUnicode_IS_ASCII(string))
9994 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009995 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009996 PyUnicode_GET_LENGTH(string), keepends);
9997 else
9998 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 break;
10002 case PyUnicode_2BYTE_KIND:
10003 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010004 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 PyUnicode_GET_LENGTH(string), keepends);
10006 break;
10007 case PyUnicode_4BYTE_KIND:
10008 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010009 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 PyUnicode_GET_LENGTH(string), keepends);
10011 break;
10012 default:
10013 assert(0);
10014 list = 0;
10015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 Py_DECREF(string);
10017 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018}
10019
Alexander Belopolsky40018472011-02-26 01:02:56 +000010020static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010021split(PyObject *self,
10022 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010023 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 int kind1, kind2, kind;
10026 void *buf1, *buf2;
10027 Py_ssize_t len1, len2;
10028 PyObject* out;
10029
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010031 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (PyUnicode_READY(self) == -1)
10034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010037 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039 if (PyUnicode_IS_ASCII(self))
10040 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010041 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 PyUnicode_GET_LENGTH(self), maxcount
10043 );
10044 else
10045 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010046 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010047 PyUnicode_GET_LENGTH(self), maxcount
10048 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 case PyUnicode_2BYTE_KIND:
10050 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010051 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 PyUnicode_GET_LENGTH(self), maxcount
10053 );
10054 case PyUnicode_4BYTE_KIND:
10055 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010056 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 PyUnicode_GET_LENGTH(self), maxcount
10058 );
10059 default:
10060 assert(0);
10061 return NULL;
10062 }
10063
10064 if (PyUnicode_READY(substring) == -1)
10065 return NULL;
10066
10067 kind1 = PyUnicode_KIND(self);
10068 kind2 = PyUnicode_KIND(substring);
10069 kind = kind1 > kind2 ? kind1 : kind2;
10070 buf1 = PyUnicode_DATA(self);
10071 buf2 = PyUnicode_DATA(substring);
10072 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 if (!buf1)
10075 return NULL;
10076 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010077 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (!buf2) {
10079 if (kind1 != kind) PyMem_Free(buf1);
10080 return NULL;
10081 }
10082 len1 = PyUnicode_GET_LENGTH(self);
10083 len2 = PyUnicode_GET_LENGTH(substring);
10084
Benjamin Petersonead6b532011-12-20 17:23:42 -060010085 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10088 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010089 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010090 else
10091 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 case PyUnicode_2BYTE_KIND:
10095 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 case PyUnicode_4BYTE_KIND:
10099 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 break;
10102 default:
10103 out = NULL;
10104 }
10105 if (kind1 != kind)
10106 PyMem_Free(buf1);
10107 if (kind2 != kind)
10108 PyMem_Free(buf2);
10109 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110}
10111
Alexander Belopolsky40018472011-02-26 01:02:56 +000010112static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010113rsplit(PyObject *self,
10114 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010115 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 int kind1, kind2, kind;
10118 void *buf1, *buf2;
10119 Py_ssize_t len1, len2;
10120 PyObject* out;
10121
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010122 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010123 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (PyUnicode_READY(self) == -1)
10126 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010129 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010131 if (PyUnicode_IS_ASCII(self))
10132 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010133 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 PyUnicode_GET_LENGTH(self), maxcount
10135 );
10136 else
10137 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010138 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010139 PyUnicode_GET_LENGTH(self), maxcount
10140 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 case PyUnicode_2BYTE_KIND:
10142 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010143 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 PyUnicode_GET_LENGTH(self), maxcount
10145 );
10146 case PyUnicode_4BYTE_KIND:
10147 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010148 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 PyUnicode_GET_LENGTH(self), maxcount
10150 );
10151 default:
10152 assert(0);
10153 return NULL;
10154 }
10155
10156 if (PyUnicode_READY(substring) == -1)
10157 return NULL;
10158
10159 kind1 = PyUnicode_KIND(self);
10160 kind2 = PyUnicode_KIND(substring);
10161 kind = kind1 > kind2 ? kind1 : kind2;
10162 buf1 = PyUnicode_DATA(self);
10163 buf2 = PyUnicode_DATA(substring);
10164 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (!buf1)
10167 return NULL;
10168 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (!buf2) {
10171 if (kind1 != kind) PyMem_Free(buf1);
10172 return NULL;
10173 }
10174 len1 = PyUnicode_GET_LENGTH(self);
10175 len2 = PyUnicode_GET_LENGTH(substring);
10176
Benjamin Petersonead6b532011-12-20 17:23:42 -060010177 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10180 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 else
10183 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 case PyUnicode_2BYTE_KIND:
10187 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 break;
10190 case PyUnicode_4BYTE_KIND:
10191 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 default:
10195 out = NULL;
10196 }
10197 if (kind1 != kind)
10198 PyMem_Free(buf1);
10199 if (kind2 != kind)
10200 PyMem_Free(buf2);
10201 return out;
10202}
10203
10204static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010205anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10206 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10211 return asciilib_find(buf1, len1, buf2, len2, offset);
10212 else
10213 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 case PyUnicode_2BYTE_KIND:
10215 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10216 case PyUnicode_4BYTE_KIND:
10217 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10218 }
10219 assert(0);
10220 return -1;
10221}
10222
10223static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10225 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010227 switch (kind) {
10228 case PyUnicode_1BYTE_KIND:
10229 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10230 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10231 else
10232 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10233 case PyUnicode_2BYTE_KIND:
10234 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10235 case PyUnicode_4BYTE_KIND:
10236 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10237 }
10238 assert(0);
10239 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010240}
10241
Alexander Belopolsky40018472011-02-26 01:02:56 +000010242static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243replace(PyObject *self, PyObject *str1,
10244 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyObject *u;
10247 char *sbuf = PyUnicode_DATA(self);
10248 char *buf1 = PyUnicode_DATA(str1);
10249 char *buf2 = PyUnicode_DATA(str2);
10250 int srelease = 0, release1 = 0, release2 = 0;
10251 int skind = PyUnicode_KIND(self);
10252 int kind1 = PyUnicode_KIND(str1);
10253 int kind2 = PyUnicode_KIND(str2);
10254 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10255 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10256 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 int mayshrink;
10258 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259
10260 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010263 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
Victor Stinner59de0ee2011-10-07 10:01:28 +020010265 if (str1 == str2)
10266 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (skind < kind1)
10268 /* substring too wide to be present */
10269 goto nothing;
10270
Victor Stinner49a0a212011-10-12 23:46:10 +020010271 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10272 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10273 /* Replacing str1 with str2 may cause a maxchar reduction in the
10274 result string. */
10275 mayshrink = (maxchar_str2 < maxchar);
10276 maxchar = Py_MAX(maxchar, maxchar_str2);
10277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010281 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010283 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010284 Py_UCS4 u1, u2;
10285 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010286 Py_ssize_t index, pos;
10287 char *src;
10288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010290 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10291 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010297 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010299
10300 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10301 index = 0;
10302 src = sbuf;
10303 while (--maxcount)
10304 {
10305 pos++;
10306 src += pos * PyUnicode_KIND(self);
10307 slen -= pos;
10308 index += pos;
10309 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10310 if (pos < 0)
10311 break;
10312 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10313 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010314 }
10315 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 int rkind = skind;
10317 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010318 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (kind1 < rkind) {
10321 /* widen substring */
10322 buf1 = _PyUnicode_AsKind(str1, rkind);
10323 if (!buf1) goto error;
10324 release1 = 1;
10325 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 if (i < 0)
10328 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (rkind > kind2) {
10330 /* widen replacement */
10331 buf2 = _PyUnicode_AsKind(str2, rkind);
10332 if (!buf2) goto error;
10333 release2 = 1;
10334 }
10335 else if (rkind < kind2) {
10336 /* widen self and buf1 */
10337 rkind = kind2;
10338 if (release1) PyMem_Free(buf1);
10339 sbuf = _PyUnicode_AsKind(self, rkind);
10340 if (!sbuf) goto error;
10341 srelease = 1;
10342 buf1 = _PyUnicode_AsKind(str1, rkind);
10343 if (!buf1) goto error;
10344 release1 = 1;
10345 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010346 u = PyUnicode_New(slen, maxchar);
10347 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010349 assert(PyUnicode_KIND(u) == rkind);
10350 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010351
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010352 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010353 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010356 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010358
10359 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010360 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010362 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010363 if (i == -1)
10364 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010365 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010367 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010371 }
10372 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 Py_ssize_t n, i, j, ires;
10374 Py_ssize_t product, new_size;
10375 int rkind = skind;
10376 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010379 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 buf1 = _PyUnicode_AsKind(str1, rkind);
10381 if (!buf1) goto error;
10382 release1 = 1;
10383 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010385 if (n == 0)
10386 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010388 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 buf2 = _PyUnicode_AsKind(str2, rkind);
10390 if (!buf2) goto error;
10391 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010394 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 rkind = kind2;
10396 sbuf = _PyUnicode_AsKind(self, rkind);
10397 if (!sbuf) goto error;
10398 srelease = 1;
10399 if (release1) PyMem_Free(buf1);
10400 buf1 = _PyUnicode_AsKind(str1, rkind);
10401 if (!buf1) goto error;
10402 release1 = 1;
10403 }
10404 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10405 PyUnicode_GET_LENGTH(str1))); */
10406 product = n * (len2-len1);
10407 if ((product / (len2-len1)) != n) {
10408 PyErr_SetString(PyExc_OverflowError,
10409 "replace string is too long");
10410 goto error;
10411 }
10412 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010413 if (new_size == 0) {
10414 Py_INCREF(unicode_empty);
10415 u = unicode_empty;
10416 goto done;
10417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10419 PyErr_SetString(PyExc_OverflowError,
10420 "replace string is too long");
10421 goto error;
10422 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010423 u = PyUnicode_New(new_size, maxchar);
10424 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 assert(PyUnicode_KIND(u) == rkind);
10427 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 ires = i = 0;
10429 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430 while (n-- > 0) {
10431 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010433 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010435 if (j == -1)
10436 break;
10437 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010439 memcpy(res + rkind * ires,
10440 sbuf + rkind * i,
10441 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443 }
10444 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010448 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 memcpy(res + rkind * ires,
10456 sbuf + rkind * i,
10457 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010458 }
10459 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460 /* interleave */
10461 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010464 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466 if (--n <= 0)
10467 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010468 memcpy(res + rkind * ires,
10469 sbuf + rkind * i,
10470 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 ires++;
10472 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010474 memcpy(res + rkind * ires,
10475 sbuf + rkind * i,
10476 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010478 }
10479
10480 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010481 unicode_adjust_maxchar(&u);
10482 if (u == NULL)
10483 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010485
10486 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (srelease)
10488 PyMem_FREE(sbuf);
10489 if (release1)
10490 PyMem_FREE(buf1);
10491 if (release2)
10492 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010493 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010495
Benjamin Peterson29060642009-01-31 22:14:21 +000010496 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (srelease)
10499 PyMem_FREE(sbuf);
10500 if (release1)
10501 PyMem_FREE(buf1);
10502 if (release2)
10503 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010504 return unicode_result_unchanged(self);
10505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 error:
10507 if (srelease && sbuf)
10508 PyMem_FREE(sbuf);
10509 if (release1 && buf1)
10510 PyMem_FREE(buf1);
10511 if (release2 && buf2)
10512 PyMem_FREE(buf2);
10513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514}
10515
10516/* --- Unicode Object Methods --------------------------------------------- */
10517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010518PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520\n\
10521Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010522characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
10524static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010525unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010527 if (PyUnicode_READY(self) == -1)
10528 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010529 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530}
10531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010532PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534\n\
10535Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010536have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537
10538static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010539unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010541 if (PyUnicode_READY(self) == -1)
10542 return NULL;
10543 if (PyUnicode_GET_LENGTH(self) == 0)
10544 return unicode_result_unchanged(self);
10545 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546}
10547
Benjamin Petersond5890c82012-01-14 13:23:30 -050010548PyDoc_STRVAR(casefold__doc__,
10549 "S.casefold() -> str\n\
10550\n\
10551Return a version of S suitable for caseless comparisons.");
10552
10553static PyObject *
10554unicode_casefold(PyObject *self)
10555{
10556 if (PyUnicode_READY(self) == -1)
10557 return NULL;
10558 if (PyUnicode_IS_ASCII(self))
10559 return ascii_upper_or_lower(self, 1);
10560 return case_operation(self, do_casefold);
10561}
10562
10563
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010564/* Argument converter. Coerces to a single unicode character */
10565
10566static int
10567convert_uc(PyObject *obj, void *addr)
10568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010570 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010571
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 uniobj = PyUnicode_FromObject(obj);
10573 if (uniobj == NULL) {
10574 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010576 return 0;
10577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 Py_DECREF(uniobj);
10582 return 0;
10583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 Py_DECREF(uniobj);
10586 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010587}
10588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010589PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010592Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010593done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
10595static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010596unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010598 Py_ssize_t marg, left;
10599 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 Py_UCS4 fillchar = ' ';
10601
Victor Stinnere9a29352011-10-01 02:14:59 +020010602 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Benjamin Petersonbac79492012-01-14 13:34:47 -050010605 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 return NULL;
10607
Victor Stinnerc4b49542011-12-11 22:44:26 +010010608 if (PyUnicode_GET_LENGTH(self) >= width)
10609 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610
Victor Stinnerc4b49542011-12-11 22:44:26 +010010611 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 left = marg / 2 + (marg & width & 1);
10613
Victor Stinner9310abb2011-10-05 00:59:23 +020010614 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615}
10616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617/* This function assumes that str1 and str2 are readied by the caller. */
10618
Marc-André Lemburge5034372000-08-08 08:04:29 +000010619static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010620unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 int kind1, kind2;
10623 void *data1, *data2;
10624 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 kind1 = PyUnicode_KIND(str1);
10627 kind2 = PyUnicode_KIND(str2);
10628 data1 = PyUnicode_DATA(str1);
10629 data2 = PyUnicode_DATA(str2);
10630 len1 = PyUnicode_GET_LENGTH(str1);
10631 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 for (i = 0; i < len1 && i < len2; ++i) {
10634 Py_UCS4 c1, c2;
10635 c1 = PyUnicode_READ(kind1, data1, i);
10636 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010637
10638 if (c1 != c2)
10639 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010640 }
10641
10642 return (len1 < len2) ? -1 : (len1 != len2);
10643}
10644
Alexander Belopolsky40018472011-02-26 01:02:56 +000010645int
10646PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10649 if (PyUnicode_READY(left) == -1 ||
10650 PyUnicode_READY(right) == -1)
10651 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010652 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010654 PyErr_Format(PyExc_TypeError,
10655 "Can't compare %.100s and %.100s",
10656 left->ob_type->tp_name,
10657 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 return -1;
10659}
10660
Martin v. Löwis5b222132007-06-10 09:51:05 +000010661int
10662PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 Py_ssize_t i;
10665 int kind;
10666 void *data;
10667 Py_UCS4 chr;
10668
Victor Stinner910337b2011-10-03 03:20:16 +020010669 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (PyUnicode_READY(uni) == -1)
10671 return -1;
10672 kind = PyUnicode_KIND(uni);
10673 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010674 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10676 if (chr != str[i])
10677 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010678 /* This check keeps Python strings that end in '\0' from comparing equal
10679 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010682 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010684 return 0;
10685}
10686
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010687
/* Map a C truth value onto Py_True / Py_False.  The macro does not touch
   reference counts; the user below increments the result itself. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010690
Alexander Belopolsky40018472011-02-26 01:02:56 +000010691PyObject *
10692PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010693{
10694 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010695
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010696 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10697 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 if (PyUnicode_READY(left) == -1 ||
10699 PyUnicode_READY(right) == -1)
10700 return NULL;
10701 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10702 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010703 if (op == Py_EQ) {
10704 Py_INCREF(Py_False);
10705 return Py_False;
10706 }
10707 if (op == Py_NE) {
10708 Py_INCREF(Py_True);
10709 return Py_True;
10710 }
10711 }
10712 if (left == right)
10713 result = 0;
10714 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010715 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010716
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010717 /* Convert the return value to a Boolean */
10718 switch (op) {
10719 case Py_EQ:
10720 v = TEST_COND(result == 0);
10721 break;
10722 case Py_NE:
10723 v = TEST_COND(result != 0);
10724 break;
10725 case Py_LE:
10726 v = TEST_COND(result <= 0);
10727 break;
10728 case Py_GE:
10729 v = TEST_COND(result >= 0);
10730 break;
10731 case Py_LT:
10732 v = TEST_COND(result == -1);
10733 break;
10734 case Py_GT:
10735 v = TEST_COND(result == 1);
10736 break;
10737 default:
10738 PyErr_BadArgument();
10739 return NULL;
10740 }
10741 Py_INCREF(v);
10742 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010744
Brian Curtindfc80e32011-08-10 20:28:54 -050010745 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010746}
10747
Alexander Belopolsky40018472011-02-26 01:02:56 +000010748int
10749PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010750{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010751 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 int kind1, kind2, kind;
10753 void *buf1, *buf2;
10754 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010755 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010756
10757 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010758 sub = PyUnicode_FromObject(element);
10759 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 PyErr_Format(PyExc_TypeError,
10761 "'in <string>' requires string as left operand, not %s",
10762 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010763 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010764 }
10765
Thomas Wouters477c8d52006-05-27 19:21:47 +000010766 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010767 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010768 Py_DECREF(sub);
10769 return -1;
10770 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010771 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10772 Py_DECREF(sub);
10773 Py_DECREF(str);
10774 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 kind1 = PyUnicode_KIND(str);
10777 kind2 = PyUnicode_KIND(sub);
10778 kind = kind1 > kind2 ? kind1 : kind2;
10779 buf1 = PyUnicode_DATA(str);
10780 buf2 = PyUnicode_DATA(sub);
10781 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010782 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (!buf1) {
10784 Py_DECREF(sub);
10785 return -1;
10786 }
10787 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010788 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 if (!buf2) {
10790 Py_DECREF(sub);
10791 if (kind1 != kind) PyMem_Free(buf1);
10792 return -1;
10793 }
10794 len1 = PyUnicode_GET_LENGTH(str);
10795 len2 = PyUnicode_GET_LENGTH(sub);
10796
Benjamin Petersonead6b532011-12-20 17:23:42 -060010797 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 case PyUnicode_1BYTE_KIND:
10799 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10800 break;
10801 case PyUnicode_2BYTE_KIND:
10802 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10803 break;
10804 case PyUnicode_4BYTE_KIND:
10805 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10806 break;
10807 default:
10808 result = -1;
10809 assert(0);
10810 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010811
10812 Py_DECREF(str);
10813 Py_DECREF(sub);
10814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (kind1 != kind)
10816 PyMem_Free(buf1);
10817 if (kind2 != kind)
10818 PyMem_Free(buf2);
10819
Guido van Rossum403d68b2000-03-13 15:55:09 +000010820 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010821}
10822
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823/* Concat to string or Unicode object giving a new Unicode object. */
10824
Alexander Belopolsky40018472011-02-26 01:02:56 +000010825PyObject *
10826PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010829 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010830 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
10832 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010838 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010841 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010845 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848 }
10849
Victor Stinner488fa492011-12-12 00:01:39 +010010850 u_len = PyUnicode_GET_LENGTH(u);
10851 v_len = PyUnicode_GET_LENGTH(v);
10852 if (u_len > PY_SSIZE_T_MAX - v_len) {
10853 PyErr_SetString(PyExc_OverflowError,
10854 "strings are too large to concat");
10855 goto onError;
10856 }
10857 new_len = u_len + v_len;
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010860 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10861 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010864 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010867 copy_characters(w, 0, u, 0, u_len);
10868 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869 Py_DECREF(u);
10870 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010871 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875 Py_XDECREF(u);
10876 Py_XDECREF(v);
10877 return NULL;
10878}
10879
Walter Dörwald1ab83302007-05-18 17:15:44 +000010880void
Victor Stinner23e56682011-10-03 03:54:37 +020010881PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010882{
Victor Stinner23e56682011-10-03 03:54:37 +020010883 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010884 Py_UCS4 maxchar, maxchar2;
10885 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010886
10887 if (p_left == NULL) {
10888 if (!PyErr_Occurred())
10889 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010890 return;
10891 }
Victor Stinner23e56682011-10-03 03:54:37 +020010892 left = *p_left;
10893 if (right == NULL || !PyUnicode_Check(left)) {
10894 if (!PyErr_Occurred())
10895 PyErr_BadInternalCall();
10896 goto error;
10897 }
10898
Benjamin Petersonbac79492012-01-14 13:34:47 -050010899 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010900 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010901 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010902 goto error;
10903
Victor Stinner488fa492011-12-12 00:01:39 +010010904 /* Shortcuts */
10905 if (left == unicode_empty) {
10906 Py_DECREF(left);
10907 Py_INCREF(right);
10908 *p_left = right;
10909 return;
10910 }
10911 if (right == unicode_empty)
10912 return;
10913
10914 left_len = PyUnicode_GET_LENGTH(left);
10915 right_len = PyUnicode_GET_LENGTH(right);
10916 if (left_len > PY_SSIZE_T_MAX - right_len) {
10917 PyErr_SetString(PyExc_OverflowError,
10918 "strings are too large to concat");
10919 goto error;
10920 }
10921 new_len = left_len + right_len;
10922
10923 if (unicode_modifiable(left)
10924 && PyUnicode_CheckExact(right)
10925 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010926 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10927 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010928 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010929 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010930 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10931 {
10932 /* append inplace */
10933 if (unicode_resize(p_left, new_len) != 0) {
10934 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10935 * deallocated so it cannot be put back into
10936 * 'variable'. The MemoryError is raised when there
10937 * is no value in 'variable', which might (very
10938 * remotely) be a cause of incompatibilities.
10939 */
10940 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010941 }
Victor Stinner488fa492011-12-12 00:01:39 +010010942 /* copy 'right' into the newly allocated area of 'left' */
10943 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010944 }
Victor Stinner488fa492011-12-12 00:01:39 +010010945 else {
10946 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10947 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10948 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010949
Victor Stinner488fa492011-12-12 00:01:39 +010010950 /* Concat the two Unicode strings */
10951 res = PyUnicode_New(new_len, maxchar);
10952 if (res == NULL)
10953 goto error;
10954 copy_characters(res, 0, left, 0, left_len);
10955 copy_characters(res, left_len, right, 0, right_len);
10956 Py_DECREF(left);
10957 *p_left = res;
10958 }
10959 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010960 return;
10961
10962error:
Victor Stinner488fa492011-12-12 00:01:39 +010010963 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010964}
10965
10966void
10967PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10968{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010969 PyUnicode_Append(pleft, right);
10970 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010971}
10972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010973PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010976Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010977string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979
10980static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010981unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010983 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010984 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010985 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 int kind1, kind2, kind;
10988 void *buf1, *buf2;
10989 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990
Jesus Ceaac451502011-04-20 17:09:23 +020010991 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10992 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 kind1 = PyUnicode_KIND(self);
10996 kind2 = PyUnicode_KIND(substring);
10997 kind = kind1 > kind2 ? kind1 : kind2;
10998 buf1 = PyUnicode_DATA(self);
10999 buf2 = PyUnicode_DATA(substring);
11000 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011001 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 if (!buf1) {
11003 Py_DECREF(substring);
11004 return NULL;
11005 }
11006 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011007 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (!buf2) {
11009 Py_DECREF(substring);
11010 if (kind1 != kind) PyMem_Free(buf1);
11011 return NULL;
11012 }
11013 len1 = PyUnicode_GET_LENGTH(self);
11014 len2 = PyUnicode_GET_LENGTH(substring);
11015
11016 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011017 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 case PyUnicode_1BYTE_KIND:
11019 iresult = ucs1lib_count(
11020 ((Py_UCS1*)buf1) + start, end - start,
11021 buf2, len2, PY_SSIZE_T_MAX
11022 );
11023 break;
11024 case PyUnicode_2BYTE_KIND:
11025 iresult = ucs2lib_count(
11026 ((Py_UCS2*)buf1) + start, end - start,
11027 buf2, len2, PY_SSIZE_T_MAX
11028 );
11029 break;
11030 case PyUnicode_4BYTE_KIND:
11031 iresult = ucs4lib_count(
11032 ((Py_UCS4*)buf1) + start, end - start,
11033 buf2, len2, PY_SSIZE_T_MAX
11034 );
11035 break;
11036 default:
11037 assert(0); iresult = 0;
11038 }
11039
11040 result = PyLong_FromSsize_t(iresult);
11041
11042 if (kind1 != kind)
11043 PyMem_Free(buf1);
11044 if (kind2 != kind)
11045 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
11047 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011048
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049 return result;
11050}
11051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011053 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011055Encode S using the codec registered for encoding. Default encoding\n\
11056is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011057handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011058a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11059'xmlcharrefreplace' as well as any other name registered with\n\
11060codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061
11062static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011063unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011065 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066 char *encoding = NULL;
11067 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011068
Benjamin Peterson308d6372009-09-18 21:42:35 +000011069 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11070 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011072 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011073}
11074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011075PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077\n\
11078Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
11081static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011082unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011084 Py_ssize_t i, j, line_pos, src_len, incr;
11085 Py_UCS4 ch;
11086 PyObject *u;
11087 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011089 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011090 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
11092 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
Antoine Pitrou22425222011-10-04 19:10:51 +020011095 if (PyUnicode_READY(self) == -1)
11096 return NULL;
11097
Thomas Wouters7e474022000-07-16 12:04:32 +000011098 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011099 src_len = PyUnicode_GET_LENGTH(self);
11100 i = j = line_pos = 0;
11101 kind = PyUnicode_KIND(self);
11102 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011103 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011104 for (; i < src_len; i++) {
11105 ch = PyUnicode_READ(kind, src_data, i);
11106 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011107 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011108 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011109 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011111 goto overflow;
11112 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011114 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011118 goto overflow;
11119 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011121 if (ch == '\n' || ch == '\r')
11122 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011124 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011125 if (!found)
11126 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011127
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011129 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 if (!u)
11131 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011132 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
Antoine Pitroue71d5742011-10-04 15:55:09 +020011134 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Antoine Pitroue71d5742011-10-04 15:55:09 +020011136 for (; i < src_len; i++) {
11137 ch = PyUnicode_READ(kind, src_data, i);
11138 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011140 incr = tabsize - (line_pos % tabsize);
11141 line_pos += incr;
11142 while (incr--) {
11143 PyUnicode_WRITE(kind, dest_data, j, ' ');
11144 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011145 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011147 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011149 line_pos++;
11150 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011151 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011152 if (ch == '\n' || ch == '\r')
11153 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011155 }
11156 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011157 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011158
Antoine Pitroue71d5742011-10-04 15:55:09 +020011159 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011160 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162}
11163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011164PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166\n\
11167Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011168such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169arguments start and end are interpreted as in slice notation.\n\
11170\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172
11173static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011176 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011177 Py_ssize_t start;
11178 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
Jesus Ceaac451502011-04-20 17:09:23 +020011181 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11182 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (PyUnicode_READY(self) == -1)
11186 return NULL;
11187 if (PyUnicode_READY(substring) == -1)
11188 return NULL;
11189
Victor Stinner7931d9a2011-11-04 00:22:48 +010011190 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (result == -2)
11195 return NULL;
11196
Christian Heimes217cfd12007-12-02 14:31:20 +000011197 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198}
11199
11200static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011201unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011203 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11204 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207}
11208
Guido van Rossumc2504932007-09-18 19:42:40 +000011209/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011210 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011211static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011212unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213{
Guido van Rossumc2504932007-09-18 19:42:40 +000011214 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011215 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (_PyUnicode_HASH(self) != -1)
11218 return _PyUnicode_HASH(self);
11219 if (PyUnicode_READY(self) == -1)
11220 return -1;
11221 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011222 /*
11223 We make the hash of the empty string be 0, rather than using
11224 (prefix ^ suffix), since this slightly obfuscates the hash secret
11225 */
11226 if (len == 0) {
11227 _PyUnicode_HASH(self) = 0;
11228 return 0;
11229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230
11231 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011232#define HASH(P) \
11233 x ^= (Py_uhash_t) *P << 7; \
11234 while (--len >= 0) \
11235 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236
Georg Brandl2fb477c2012-02-21 00:33:36 +010011237 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 switch (PyUnicode_KIND(self)) {
11239 case PyUnicode_1BYTE_KIND: {
11240 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11241 HASH(c);
11242 break;
11243 }
11244 case PyUnicode_2BYTE_KIND: {
11245 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11246 HASH(s);
11247 break;
11248 }
11249 default: {
11250 Py_UCS4 *l;
11251 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11252 "Impossible switch case in unicode_hash");
11253 l = PyUnicode_4BYTE_DATA(self);
11254 HASH(l);
11255 break;
11256 }
11257 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011258 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11259 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260
Guido van Rossumc2504932007-09-18 19:42:40 +000011261 if (x == -1)
11262 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011264 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011276 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011277 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011278 Py_ssize_t start;
11279 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Jesus Ceaac451502011-04-20 17:09:23 +020011281 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11282 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 if (PyUnicode_READY(self) == -1)
11286 return NULL;
11287 if (PyUnicode_READY(substring) == -1)
11288 return NULL;
11289
Victor Stinner7931d9a2011-11-04 00:22:48 +010011290 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
11292 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (result == -2)
11295 return NULL;
11296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 if (result < 0) {
11298 PyErr_SetString(PyExc_ValueError, "substring not found");
11299 return NULL;
11300 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011301
Christian Heimes217cfd12007-12-02 14:31:20 +000011302 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011308Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310
11311static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 Py_ssize_t i, length;
11315 int kind;
11316 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 int cased;
11318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (PyUnicode_READY(self) == -1)
11320 return NULL;
11321 length = PyUnicode_GET_LENGTH(self);
11322 kind = PyUnicode_KIND(self);
11323 data = PyUnicode_DATA(self);
11324
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (length == 1)
11327 return PyBool_FromLong(
11328 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011330 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011333
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 for (i = 0; i < length; i++) {
11336 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011337
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11339 return PyBool_FromLong(0);
11340 else if (!cased && Py_UNICODE_ISLOWER(ch))
11341 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011343 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011349Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
11352static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011353unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 Py_ssize_t i, length;
11356 int kind;
11357 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 int cased;
11359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (PyUnicode_READY(self) == -1)
11361 return NULL;
11362 length = PyUnicode_GET_LENGTH(self);
11363 kind = PyUnicode_KIND(self);
11364 data = PyUnicode_DATA(self);
11365
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 if (length == 1)
11368 return PyBool_FromLong(
11369 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011371 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011374
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 for (i = 0; i < length; i++) {
11377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011378
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11380 return PyBool_FromLong(0);
11381 else if (!cased && Py_UNICODE_ISUPPER(ch))
11382 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011384 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011390Return True if S is a titlecased string and there is at least one\n\
11391character in S, i.e. upper- and titlecase characters may only\n\
11392follow uncased characters and lowercase characters only cased ones.\n\
11393Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
11395static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011396unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 Py_ssize_t i, length;
11399 int kind;
11400 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 int cased, previous_is_cased;
11402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 if (PyUnicode_READY(self) == -1)
11404 return NULL;
11405 length = PyUnicode_GET_LENGTH(self);
11406 kind = PyUnicode_KIND(self);
11407 data = PyUnicode_DATA(self);
11408
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 1) {
11411 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11412 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11413 (Py_UNICODE_ISUPPER(ch) != 0));
11414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011416 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011419
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 cased = 0;
11421 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 for (i = 0; i < length; i++) {
11423 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011424
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11426 if (previous_is_cased)
11427 return PyBool_FromLong(0);
11428 previous_is_cased = 1;
11429 cased = 1;
11430 }
11431 else if (Py_UNICODE_ISLOWER(ch)) {
11432 if (!previous_is_cased)
11433 return PyBool_FromLong(0);
11434 previous_is_cased = 1;
11435 cased = 1;
11436 }
11437 else
11438 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011446Return True if all characters in S are whitespace\n\
11447and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011450unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 Py_ssize_t i, length;
11453 int kind;
11454 void *data;
11455
11456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 length = PyUnicode_GET_LENGTH(self);
11459 kind = PyUnicode_KIND(self);
11460 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 1)
11464 return PyBool_FromLong(
11465 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 for (i = 0; i < length; i++) {
11472 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011473 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011476 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477}
11478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011482Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011484
11485static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011486unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 Py_ssize_t i, length;
11489 int kind;
11490 void *data;
11491
11492 if (PyUnicode_READY(self) == -1)
11493 return NULL;
11494 length = PyUnicode_GET_LENGTH(self);
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011497
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011498 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (length == 1)
11500 return PyBool_FromLong(
11501 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011502
11503 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 for (i = 0; i < length; i++) {
11508 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011510 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011511 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011512}
11513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011514PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011516\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011517Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011519
11520static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011521unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 int kind;
11524 void *data;
11525 Py_ssize_t len, i;
11526
11527 if (PyUnicode_READY(self) == -1)
11528 return NULL;
11529
11530 kind = PyUnicode_KIND(self);
11531 data = PyUnicode_DATA(self);
11532 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011533
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011534 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (len == 1) {
11536 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11537 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11538 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011539
11540 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 for (i = 0; i < len; i++) {
11545 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011546 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011548 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011550}
11551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011552PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011555Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011556False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
11558static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011559unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 Py_ssize_t i, length;
11562 int kind;
11563 void *data;
11564
11565 if (PyUnicode_READY(self) == -1)
11566 return NULL;
11567 length = PyUnicode_GET_LENGTH(self);
11568 kind = PyUnicode_KIND(self);
11569 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 if (length == 1)
11573 return PyBool_FromLong(
11574 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011576 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 for (i = 0; i < length; i++) {
11581 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011584 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585}
11586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011587PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011590Return True if all characters in S are digits\n\
11591and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
11593static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011594unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 Py_ssize_t i, length;
11597 int kind;
11598 void *data;
11599
11600 if (PyUnicode_READY(self) == -1)
11601 return NULL;
11602 length = PyUnicode_GET_LENGTH(self);
11603 kind = PyUnicode_KIND(self);
11604 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (length == 1) {
11608 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11609 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011612 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 for (i = 0; i < length; i++) {
11617 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011626Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 Py_ssize_t i, length;
11633 int kind;
11634 void *data;
11635
11636 if (PyUnicode_READY(self) == -1)
11637 return NULL;
11638 length = PyUnicode_GET_LENGTH(self);
11639 kind = PyUnicode_KIND(self);
11640 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (length == 1)
11644 return PyBool_FromLong(
11645 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011647 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 for (i = 0; i < length; i++) {
11652 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011655 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656}
11657
Martin v. Löwis47383402007-08-15 07:32:56 +000011658int
11659PyUnicode_IsIdentifier(PyObject *self)
11660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 int kind;
11662 void *data;
11663 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011664 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (PyUnicode_READY(self) == -1) {
11667 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 }
11670
11671 /* Special case for empty strings */
11672 if (PyUnicode_GET_LENGTH(self) == 0)
11673 return 0;
11674 kind = PyUnicode_KIND(self);
11675 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011676
11677 /* PEP 3131 says that the first character must be in
11678 XID_Start and subsequent characters in XID_Continue,
11679 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011680 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011681 letters, digits, underscore). However, given the current
11682 definition of XID_Start and XID_Continue, it is sufficient
11683 to check just for these, except that _ must be allowed
11684 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011686 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011687 return 0;
11688
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011689 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011692 return 1;
11693}
11694
11695PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011697\n\
11698Return True if S is a valid identifier according\n\
11699to the language definition.");
11700
11701static PyObject*
11702unicode_isidentifier(PyObject *self)
11703{
11704 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11705}
11706
Georg Brandl559e5d72008-06-11 18:37:52 +000011707PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011709\n\
11710Return True if all characters in S are considered\n\
11711printable in repr() or S is empty, False otherwise.");
11712
11713static PyObject*
11714unicode_isprintable(PyObject *self)
11715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 Py_ssize_t i, length;
11717 int kind;
11718 void *data;
11719
11720 if (PyUnicode_READY(self) == -1)
11721 return NULL;
11722 length = PyUnicode_GET_LENGTH(self);
11723 kind = PyUnicode_KIND(self);
11724 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011725
11726 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 if (length == 1)
11728 return PyBool_FromLong(
11729 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 for (i = 0; i < length; i++) {
11732 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011733 Py_RETURN_FALSE;
11734 }
11735 }
11736 Py_RETURN_TRUE;
11737}
11738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011739PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011740 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741\n\
11742Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011743iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
11745static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011746unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011748 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749}
11750
Martin v. Löwis18e16552006-02-15 17:27:45 +000011751static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011752unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (PyUnicode_READY(self) == -1)
11755 return -1;
11756 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757}
11758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011759PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011762Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011763done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011766unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011768 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 Py_UCS4 fillchar = ' ';
11770
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011771 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 return NULL;
11773
Benjamin Petersonbac79492012-01-14 13:34:47 -050011774 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
Victor Stinnerc4b49542011-12-11 22:44:26 +010011777 if (PyUnicode_GET_LENGTH(self) >= width)
11778 return unicode_result_unchanged(self);
11779
11780 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781}
11782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011783PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011786Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787
11788static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011789unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011791 if (PyUnicode_READY(self) == -1)
11792 return NULL;
11793 if (PyUnicode_IS_ASCII(self))
11794 return ascii_upper_or_lower(self, 1);
11795 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796}
11797
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i] + 3)
11806
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807/* externally visible for str.strip(unicode) */
11808PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011809_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 void *data;
11812 int kind;
11813 Py_ssize_t i, j, len;
11814 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11817 return NULL;
11818
11819 kind = PyUnicode_KIND(self);
11820 data = PyUnicode_DATA(self);
11821 len = PyUnicode_GET_LENGTH(self);
11822 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11823 PyUnicode_DATA(sepobj),
11824 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011825
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 i = 0;
11827 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 while (i < len &&
11829 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 i++;
11831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011832 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833
Benjamin Peterson14339b62009-01-31 16:36:08 +000011834 j = len;
11835 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 do {
11837 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 } while (j >= i &&
11839 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011841 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842
Victor Stinner7931d9a2011-11-04 00:22:48 +010011843 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844}
11845
11846PyObject*
11847PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11848{
11849 unsigned char *data;
11850 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011851 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852
Victor Stinnerde636f32011-10-01 03:55:54 +020011853 if (PyUnicode_READY(self) == -1)
11854 return NULL;
11855
11856 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11857
Victor Stinner12bab6d2011-10-01 01:53:49 +020011858 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011859 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860
Victor Stinner12bab6d2011-10-01 01:53:49 +020011861 length = end - start;
11862 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011863 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864
Victor Stinnerde636f32011-10-01 03:55:54 +020011865 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011866 PyErr_SetString(PyExc_IndexError, "string index out of range");
11867 return NULL;
11868 }
11869
Victor Stinnerb9275c12011-10-05 14:01:42 +020011870 if (PyUnicode_IS_ASCII(self)) {
11871 kind = PyUnicode_KIND(self);
11872 data = PyUnicode_1BYTE_DATA(self);
11873 return unicode_fromascii(data + start, length);
11874 }
11875 else {
11876 kind = PyUnicode_KIND(self);
11877 data = PyUnicode_1BYTE_DATA(self);
11878 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011879 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011880 length);
11881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883
11884static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011885do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 int kind;
11888 void *data;
11889 Py_ssize_t len, i, j;
11890
11891 if (PyUnicode_READY(self) == -1)
11892 return NULL;
11893
11894 kind = PyUnicode_KIND(self);
11895 data = PyUnicode_DATA(self);
11896 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011897
Benjamin Peterson14339b62009-01-31 16:36:08 +000011898 i = 0;
11899 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011901 i++;
11902 }
11903 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011904
Benjamin Peterson14339b62009-01-31 16:36:08 +000011905 j = len;
11906 if (striptype != LEFTSTRIP) {
11907 do {
11908 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 j++;
11911 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011912
Victor Stinner7931d9a2011-11-04 00:22:48 +010011913 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914}
11915
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011916
11917static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011918do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011919{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011920 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011921
Benjamin Peterson14339b62009-01-31 16:36:08 +000011922 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11923 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011924
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 if (sep != NULL && sep != Py_None) {
11926 if (PyUnicode_Check(sep))
11927 return _PyUnicode_XStrip(self, striptype, sep);
11928 else {
11929 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "%s arg must be None or str",
11931 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011932 return NULL;
11933 }
11934 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011935
Benjamin Peterson14339b62009-01-31 16:36:08 +000011936 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011937}
11938
11939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011940PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011942\n\
11943Return a copy of the string S with leading and trailing\n\
11944whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011945If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011946
11947static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011948unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011949{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011950 if (PyTuple_GET_SIZE(args) == 0)
11951 return do_strip(self, BOTHSTRIP); /* Common case */
11952 else
11953 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011954}
11955
11956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011959\n\
11960Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011961If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011962
11963static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011964unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011966 if (PyTuple_GET_SIZE(args) == 0)
11967 return do_strip(self, LEFTSTRIP); /* Common case */
11968 else
11969 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011970}
11971
11972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011973PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011975\n\
11976Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011977If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011978
11979static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011980unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011982 if (PyTuple_GET_SIZE(args) == 0)
11983 return do_strip(self, RIGHTSTRIP); /* Common case */
11984 else
11985 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011986}
11987
11988
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011990unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011992 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Georg Brandl222de0f2009-04-12 12:01:50 +000011995 if (len < 1) {
11996 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011997 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Victor Stinnerc4b49542011-12-11 22:44:26 +010012000 /* no repeat, return original string */
12001 if (len == 1)
12002 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012003
Benjamin Petersonbac79492012-01-14 13:34:47 -050012004 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 return NULL;
12006
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012007 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012008 PyErr_SetString(PyExc_OverflowError,
12009 "repeated string is too long");
12010 return NULL;
12011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012013
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012014 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015 if (!u)
12016 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012017 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (PyUnicode_GET_LENGTH(str) == 1) {
12020 const int kind = PyUnicode_KIND(str);
12021 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012022 if (kind == PyUnicode_1BYTE_KIND) {
12023 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012024 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012025 }
12026 else if (kind == PyUnicode_2BYTE_KIND) {
12027 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012028 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012029 ucs2[n] = fill_char;
12030 } else {
12031 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12032 assert(kind == PyUnicode_4BYTE_KIND);
12033 for (n = 0; n < len; ++n)
12034 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 }
12037 else {
12038 /* number of characters copied this far */
12039 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012040 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 char *to = (char *) PyUnicode_DATA(u);
12042 Py_MEMCPY(to, PyUnicode_DATA(str),
12043 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 n = (done <= nchars-done) ? done : nchars-done;
12046 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012047 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 }
12050
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012051 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012052 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053}
12054
Alexander Belopolsky40018472011-02-26 01:02:56 +000012055PyObject *
12056PyUnicode_Replace(PyObject *obj,
12057 PyObject *subobj,
12058 PyObject *replobj,
12059 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060{
12061 PyObject *self;
12062 PyObject *str1;
12063 PyObject *str2;
12064 PyObject *result;
12065
12066 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012067 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012070 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 Py_DECREF(self);
12072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 }
12074 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012075 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 Py_DECREF(self);
12077 Py_DECREF(str1);
12078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012080 if (PyUnicode_READY(self) == -1 ||
12081 PyUnicode_READY(str1) == -1 ||
12082 PyUnicode_READY(str2) == -1)
12083 result = NULL;
12084 else
12085 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 Py_DECREF(self);
12087 Py_DECREF(str1);
12088 Py_DECREF(str2);
12089 return result;
12090}
12091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012092PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012093 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094\n\
12095Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012096old replaced by new. If the optional argument count is\n\
12097given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 PyObject *str1;
12103 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012104 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 PyObject *result;
12106
Martin v. Löwis18e16552006-02-15 17:27:45 +000012107 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012109 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012112 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 return NULL;
12114 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012115 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 Py_DECREF(str1);
12117 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012118 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012119 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12120 result = NULL;
12121 else
12122 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
12124 Py_DECREF(str1);
12125 Py_DECREF(str2);
12126 return result;
12127}
12128
Alexander Belopolsky40018472011-02-26 01:02:56 +000012129static PyObject *
12130unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012132 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 Py_ssize_t isize;
12134 Py_ssize_t osize, squote, dquote, i, o;
12135 Py_UCS4 max, quote;
12136 int ikind, okind;
12137 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012140 return NULL;
12141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 isize = PyUnicode_GET_LENGTH(unicode);
12143 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 /* Compute length of output, quote characters, and
12146 maximum character */
12147 osize = 2; /* quotes */
12148 max = 127;
12149 squote = dquote = 0;
12150 ikind = PyUnicode_KIND(unicode);
12151 for (i = 0; i < isize; i++) {
12152 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12153 switch (ch) {
12154 case '\'': squote++; osize++; break;
12155 case '"': dquote++; osize++; break;
12156 case '\\': case '\t': case '\r': case '\n':
12157 osize += 2; break;
12158 default:
12159 /* Fast-path ASCII */
12160 if (ch < ' ' || ch == 0x7f)
12161 osize += 4; /* \xHH */
12162 else if (ch < 0x7f)
12163 osize++;
12164 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12165 osize++;
12166 max = ch > max ? ch : max;
12167 }
12168 else if (ch < 0x100)
12169 osize += 4; /* \xHH */
12170 else if (ch < 0x10000)
12171 osize += 6; /* \uHHHH */
12172 else
12173 osize += 10; /* \uHHHHHHHH */
12174 }
12175 }
12176
12177 quote = '\'';
12178 if (squote) {
12179 if (dquote)
12180 /* Both squote and dquote present. Use squote,
12181 and escape them */
12182 osize += squote;
12183 else
12184 quote = '"';
12185 }
12186
12187 repr = PyUnicode_New(osize, max);
12188 if (repr == NULL)
12189 return NULL;
12190 okind = PyUnicode_KIND(repr);
12191 odata = PyUnicode_DATA(repr);
12192
12193 PyUnicode_WRITE(okind, odata, 0, quote);
12194 PyUnicode_WRITE(okind, odata, osize-1, quote);
12195
12196 for (i = 0, o = 1; i < isize; i++) {
12197 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012198
12199 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if ((ch == quote) || (ch == '\\')) {
12201 PyUnicode_WRITE(okind, odata, o++, '\\');
12202 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012203 continue;
12204 }
12205
Benjamin Peterson29060642009-01-31 22:14:21 +000012206 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012207 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 PyUnicode_WRITE(okind, odata, o++, '\\');
12209 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012210 }
12211 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 PyUnicode_WRITE(okind, odata, o++, '\\');
12213 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012214 }
12215 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 PyUnicode_WRITE(okind, odata, o++, '\\');
12217 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012218 }
12219
12220 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012221 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 PyUnicode_WRITE(okind, odata, o++, '\\');
12223 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012224 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12225 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012226 }
12227
Georg Brandl559e5d72008-06-11 18:37:52 +000012228 /* Copy ASCII characters as-is */
12229 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012231 }
12232
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012234 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012235 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012236 (categories Z* and C* except ASCII space)
12237 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012239 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (ch <= 0xff) {
12241 PyUnicode_WRITE(okind, odata, o++, '\\');
12242 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012243 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12244 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012245 }
12246 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 else if (ch >= 0x10000) {
12248 PyUnicode_WRITE(okind, odata, o++, '\\');
12249 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012250 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12251 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12252 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12253 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012258 }
12259 /* Map 16-bit characters to '\uxxxx' */
12260 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 PyUnicode_WRITE(okind, odata, o++, '\\');
12262 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012263 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12264 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12265 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12266 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012267 }
12268 }
12269 /* Copy characters as-is */
12270 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012272 }
12273 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012276 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012277 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012280PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282\n\
12283Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012284such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285arguments start and end are interpreted as in slice notation.\n\
12286\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012287Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288
12289static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012292 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012293 Py_ssize_t start;
12294 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Jesus Ceaac451502011-04-20 17:09:23 +020012297 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12298 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (PyUnicode_READY(self) == -1)
12302 return NULL;
12303 if (PyUnicode_READY(substring) == -1)
12304 return NULL;
12305
Victor Stinner7931d9a2011-11-04 00:22:48 +010012306 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
12308 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (result == -2)
12311 return NULL;
12312
Christian Heimes217cfd12007-12-02 14:31:20 +000012313 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012316PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012319Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
12321static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012324 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012325 Py_ssize_t start;
12326 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
Jesus Ceaac451502011-04-20 17:09:23 +020012329 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12330 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (PyUnicode_READY(self) == -1)
12334 return NULL;
12335 if (PyUnicode_READY(substring) == -1)
12336 return NULL;
12337
Victor Stinner7931d9a2011-11-04 00:22:48 +010012338 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
12340 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 if (result == -2)
12343 return NULL;
12344
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345 if (result < 0) {
12346 PyErr_SetString(PyExc_ValueError, "substring not found");
12347 return NULL;
12348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349
Christian Heimes217cfd12007-12-02 14:31:20 +000012350 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351}
12352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012353PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012356Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012357done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
12359static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012360unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012362 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 Py_UCS4 fillchar = ' ';
12364
Victor Stinnere9a29352011-10-01 02:14:59 +020012365 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012367
Benjamin Petersonbac79492012-01-14 13:34:47 -050012368 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 return NULL;
12370
Victor Stinnerc4b49542011-12-11 22:44:26 +010012371 if (PyUnicode_GET_LENGTH(self) >= width)
12372 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373
Victor Stinnerc4b49542011-12-11 22:44:26 +010012374 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375}
12376
Alexander Belopolsky40018472011-02-26 01:02:56 +000012377PyObject *
12378PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379{
12380 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012381
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 s = PyUnicode_FromObject(s);
12383 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012384 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 if (sep != NULL) {
12386 sep = PyUnicode_FromObject(sep);
12387 if (sep == NULL) {
12388 Py_DECREF(s);
12389 return NULL;
12390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391 }
12392
Victor Stinner9310abb2011-10-05 00:59:23 +020012393 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394
12395 Py_DECREF(s);
12396 Py_XDECREF(sep);
12397 return result;
12398}
12399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012400PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402\n\
12403Return a list of the words in S, using sep as the\n\
12404delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012405splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012406whitespace string is a separator and empty strings are\n\
12407removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408
12409static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012410unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411{
12412 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012413 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414
Martin v. Löwis18e16552006-02-15 17:27:45 +000012415 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 return NULL;
12417
12418 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012421 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012423 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424}
12425
Thomas Wouters477c8d52006-05-27 19:21:47 +000012426PyObject *
12427PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12428{
12429 PyObject* str_obj;
12430 PyObject* sep_obj;
12431 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 int kind1, kind2, kind;
12433 void *buf1 = NULL, *buf2 = NULL;
12434 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435
12436 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012437 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012440 if (!sep_obj) {
12441 Py_DECREF(str_obj);
12442 return NULL;
12443 }
12444 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12445 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446 Py_DECREF(str_obj);
12447 return NULL;
12448 }
12449
Victor Stinner14f8f022011-10-05 20:58:25 +020012450 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012452 kind = Py_MAX(kind1, kind2);
12453 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012455 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 if (!buf1)
12457 goto onError;
12458 buf2 = PyUnicode_DATA(sep_obj);
12459 if (kind2 != kind)
12460 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12461 if (!buf2)
12462 goto onError;
12463 len1 = PyUnicode_GET_LENGTH(str_obj);
12464 len2 = PyUnicode_GET_LENGTH(sep_obj);
12465
Benjamin Petersonead6b532011-12-20 17:23:42 -060012466 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012468 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12469 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12470 else
12471 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 break;
12473 case PyUnicode_2BYTE_KIND:
12474 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12475 break;
12476 case PyUnicode_4BYTE_KIND:
12477 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12478 break;
12479 default:
12480 assert(0);
12481 out = 0;
12482 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012483
12484 Py_DECREF(sep_obj);
12485 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 if (kind1 != kind)
12487 PyMem_Free(buf1);
12488 if (kind2 != kind)
12489 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012490
12491 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 onError:
12493 Py_DECREF(sep_obj);
12494 Py_DECREF(str_obj);
12495 if (kind1 != kind && buf1)
12496 PyMem_Free(buf1);
12497 if (kind2 != kind && buf2)
12498 PyMem_Free(buf2);
12499 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012500}
12501
12502
12503PyObject *
12504PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12505{
12506 PyObject* str_obj;
12507 PyObject* sep_obj;
12508 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 int kind1, kind2, kind;
12510 void *buf1 = NULL, *buf2 = NULL;
12511 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012512
12513 str_obj = PyUnicode_FromObject(str_in);
12514 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012516 sep_obj = PyUnicode_FromObject(sep_in);
12517 if (!sep_obj) {
12518 Py_DECREF(str_obj);
12519 return NULL;
12520 }
12521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 kind1 = PyUnicode_KIND(str_in);
12523 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012524 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 buf1 = PyUnicode_DATA(str_in);
12526 if (kind1 != kind)
12527 buf1 = _PyUnicode_AsKind(str_in, kind);
12528 if (!buf1)
12529 goto onError;
12530 buf2 = PyUnicode_DATA(sep_obj);
12531 if (kind2 != kind)
12532 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12533 if (!buf2)
12534 goto onError;
12535 len1 = PyUnicode_GET_LENGTH(str_obj);
12536 len2 = PyUnicode_GET_LENGTH(sep_obj);
12537
Benjamin Petersonead6b532011-12-20 17:23:42 -060012538 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012540 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12541 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12542 else
12543 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 break;
12545 case PyUnicode_2BYTE_KIND:
12546 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12547 break;
12548 case PyUnicode_4BYTE_KIND:
12549 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12550 break;
12551 default:
12552 assert(0);
12553 out = 0;
12554 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012555
12556 Py_DECREF(sep_obj);
12557 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 if (kind1 != kind)
12559 PyMem_Free(buf1);
12560 if (kind2 != kind)
12561 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012562
12563 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 onError:
12565 Py_DECREF(sep_obj);
12566 Py_DECREF(str_obj);
12567 if (kind1 != kind && buf1)
12568 PyMem_Free(buf1);
12569 if (kind2 != kind && buf2)
12570 PyMem_Free(buf2);
12571 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572}
12573
12574PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012576\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012577Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012579found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012580
12581static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012582unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012583{
Victor Stinner9310abb2011-10-05 00:59:23 +020012584 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012585}
12586
12587PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012588 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012589\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012590Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012591the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012592separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012593
12594static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012595unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012596{
Victor Stinner9310abb2011-10-05 00:59:23 +020012597 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012598}
12599
Alexander Belopolsky40018472011-02-26 01:02:56 +000012600PyObject *
12601PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012602{
12603 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012605 s = PyUnicode_FromObject(s);
12606 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012607 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 if (sep != NULL) {
12609 sep = PyUnicode_FromObject(sep);
12610 if (sep == NULL) {
12611 Py_DECREF(s);
12612 return NULL;
12613 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012614 }
12615
Victor Stinner9310abb2011-10-05 00:59:23 +020012616 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012617
12618 Py_DECREF(s);
12619 Py_XDECREF(sep);
12620 return result;
12621}
12622
12623PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012625\n\
12626Return a list of the words in S, using sep as the\n\
12627delimiter string, starting at the end of the string and\n\
12628working to the front. If maxsplit is given, at most maxsplit\n\
12629splits are done. If sep is not specified, any whitespace string\n\
12630is a separator.");
12631
12632static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012633unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012634{
12635 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012636 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012637
Martin v. Löwis18e16552006-02-15 17:27:45 +000012638 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012639 return NULL;
12640
12641 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012643 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012644 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012645 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012646 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012647}
12648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651\n\
12652Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012653Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012654is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655
12656static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012659 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012660 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012662 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12663 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 return NULL;
12665
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667}
12668
12669static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012670PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012672 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673}
12674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012675PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677\n\
12678Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012679and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680
12681static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012682unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012684 if (PyUnicode_READY(self) == -1)
12685 return NULL;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012686 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687}
12688
Georg Brandlceee0772007-11-27 23:48:05 +000012689PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012691\n\
12692Return a translation table usable for str.translate().\n\
12693If there is only one argument, it must be a dictionary mapping Unicode\n\
12694ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012695Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012696If there are two arguments, they must be strings of equal length, and\n\
12697in the resulting dictionary, each character in x will be mapped to the\n\
12698character at the same position in y. If there is a third argument, it\n\
12699must be a string, whose characters will be mapped to None in the result.");
12700
12701static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012702unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012703{
12704 PyObject *x, *y = NULL, *z = NULL;
12705 PyObject *new = NULL, *key, *value;
12706 Py_ssize_t i = 0;
12707 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708
Georg Brandlceee0772007-11-27 23:48:05 +000012709 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12710 return NULL;
12711 new = PyDict_New();
12712 if (!new)
12713 return NULL;
12714 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 int x_kind, y_kind, z_kind;
12716 void *x_data, *y_data, *z_data;
12717
Georg Brandlceee0772007-11-27 23:48:05 +000012718 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012719 if (!PyUnicode_Check(x)) {
12720 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12721 "be a string if there is a second argument");
12722 goto err;
12723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012725 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12726 "arguments must have equal length");
12727 goto err;
12728 }
12729 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 x_kind = PyUnicode_KIND(x);
12731 y_kind = PyUnicode_KIND(y);
12732 x_data = PyUnicode_DATA(x);
12733 y_data = PyUnicode_DATA(y);
12734 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12735 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012736 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012737 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012738 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012739 if (!value) {
12740 Py_DECREF(key);
12741 goto err;
12742 }
Georg Brandlceee0772007-11-27 23:48:05 +000012743 res = PyDict_SetItem(new, key, value);
12744 Py_DECREF(key);
12745 Py_DECREF(value);
12746 if (res < 0)
12747 goto err;
12748 }
12749 /* create entries for deleting chars in z */
12750 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 z_kind = PyUnicode_KIND(z);
12752 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012753 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012755 if (!key)
12756 goto err;
12757 res = PyDict_SetItem(new, key, Py_None);
12758 Py_DECREF(key);
12759 if (res < 0)
12760 goto err;
12761 }
12762 }
12763 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 int kind;
12765 void *data;
12766
Georg Brandlceee0772007-11-27 23:48:05 +000012767 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012768 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012769 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12770 "to maketrans it must be a dict");
12771 goto err;
12772 }
12773 /* copy entries into the new dict, converting string keys to int keys */
12774 while (PyDict_Next(x, &i, &key, &value)) {
12775 if (PyUnicode_Check(key)) {
12776 /* convert string keys to integer keys */
12777 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012778 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012779 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12780 "table must be of length 1");
12781 goto err;
12782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 kind = PyUnicode_KIND(key);
12784 data = PyUnicode_DATA(key);
12785 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012786 if (!newkey)
12787 goto err;
12788 res = PyDict_SetItem(new, newkey, value);
12789 Py_DECREF(newkey);
12790 if (res < 0)
12791 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012792 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012793 /* just keep integer keys */
12794 if (PyDict_SetItem(new, key, value) < 0)
12795 goto err;
12796 } else {
12797 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12798 "be strings or integers");
12799 goto err;
12800 }
12801 }
12802 }
12803 return new;
12804 err:
12805 Py_DECREF(new);
12806 return NULL;
12807}
12808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811\n\
12812Return a copy of the string S, where all characters have been mapped\n\
12813through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012814Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012815Unmapped characters are left untouched. Characters mapped to None\n\
12816are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822}
12823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012824PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012827Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
12829static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012830unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012832 if (PyUnicode_READY(self) == -1)
12833 return NULL;
12834 if (PyUnicode_IS_ASCII(self))
12835 return ascii_upper_or_lower(self, 0);
12836 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837}
12838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012839PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012842Pad a numeric string S with zeros on the left, to fill a field\n\
12843of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844
12845static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012846unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012848 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012849 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012850 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 int kind;
12852 void *data;
12853 Py_UCS4 chr;
12854
Martin v. Löwis18e16552006-02-15 17:27:45 +000012855 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856 return NULL;
12857
Benjamin Petersonbac79492012-01-14 13:34:47 -050012858 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860
Victor Stinnerc4b49542011-12-11 22:44:26 +010012861 if (PyUnicode_GET_LENGTH(self) >= width)
12862 return unicode_result_unchanged(self);
12863
12864 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865
12866 u = pad(self, fill, 0, '0');
12867
Walter Dörwald068325e2002-04-15 13:36:47 +000012868 if (u == NULL)
12869 return NULL;
12870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 kind = PyUnicode_KIND(u);
12872 data = PyUnicode_DATA(u);
12873 chr = PyUnicode_READ(kind, data, fill);
12874
12875 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 PyUnicode_WRITE(kind, data, 0, chr);
12878 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879 }
12880
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012881 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012882 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
/* Compiled out by the surrounding #if 0: a wrapper that would return
   PyUnicode_TransformDecimalAndSpaceToASCII(self).  Its method-table entry
   ("_decimal2ascii") is disabled by a matching #if 0. */
12885#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012886static PyObject *
12887unicode__decimal2ascii(PyObject *self)
12888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012890}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891#endif
12892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012893PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012894 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012896Return True if S starts with the specified prefix, False otherwise.\n\
12897With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012898With optional end, stop comparing S at that position.\n\
12899prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
12901static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012902unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012905 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012906 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012907 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012908 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012909 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910
Jesus Ceaac451502011-04-20 17:09:23 +020012911 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012913 if (PyTuple_Check(subobj)) {
12914 Py_ssize_t i;
12915 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012916 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012917 if (substring == NULL)
12918 return NULL;
12919 result = tailmatch(self, substring, start, end, -1);
12920 Py_DECREF(substring);
12921 if (result) {
12922 Py_RETURN_TRUE;
12923 }
12924 }
12925 /* nothing matched */
12926 Py_RETURN_FALSE;
12927 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012928 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012929 if (substring == NULL) {
12930 if (PyErr_ExceptionMatches(PyExc_TypeError))
12931 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12932 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012934 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012935 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012937 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938}
12939
12940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012941PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012944Return True if S ends with the specified suffix, False otherwise.\n\
12945With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012946With optional end, stop comparing S at that position.\n\
12947suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
12949static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012950unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012951 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012953 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012954 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012955 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012956 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012957 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
Jesus Ceaac451502011-04-20 17:09:23 +020012959 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012961 if (PyTuple_Check(subobj)) {
12962 Py_ssize_t i;
12963 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012964 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012966 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012968 result = tailmatch(self, substring, start, end, +1);
12969 Py_DECREF(substring);
12970 if (result) {
12971 Py_RETURN_TRUE;
12972 }
12973 }
12974 Py_RETURN_FALSE;
12975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012976 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012977 if (substring == NULL) {
12978 if (PyErr_ExceptionMatches(PyExc_TypeError))
12979 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12980 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012982 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012983 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012985 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012989
12990PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012992\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012993Return a formatted version of S, using substitutions from args and kwargs.\n\
12994The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012995
Eric Smith27bbca62010-11-04 17:06:58 +000012996PyDoc_STRVAR(format_map__doc__,
12997 "S.format_map(mapping) -> str\n\
12998\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012999Return a formatted version of S, using substitutions from mapping.\n\
13000The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013001
Eric Smith4a7d76d2008-05-30 18:10:19 +000013002static PyObject *
13003unicode__format__(PyObject* self, PyObject* args)
13004{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013005 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013006
13007 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13008 return NULL;
13009
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013010 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013012 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013013}
13014
Eric Smith8c663262007-08-25 02:26:07 +000013015PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013017\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013018Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013019
13020static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013021unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 Py_ssize_t size;
13024
13025 /* If it's a compact object, account for base structure +
13026 character data. */
13027 if (PyUnicode_IS_COMPACT_ASCII(v))
13028 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13029 else if (PyUnicode_IS_COMPACT(v))
13030 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013031 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 else {
13033 /* If it is a two-block object, account for base object, and
13034 for character block if present. */
13035 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013036 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013038 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 }
13040 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013041 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013042 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013044 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013045 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046
13047 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013048}
13049
13050PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013052
13053static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013054unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013055{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013056 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 if (!copy)
13058 return NULL;
13059 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013060}
13061
/* Method table: one PyMethodDef per entry, giving the Python-visible name,
   the C implementation, the calling-convention flags and (for most
   entries) the docstring.  The list ends with the {NULL, NULL} sentinel.
   "maketrans" is the only entry flagged METH_STATIC; "_decimal2ascii" is
   compiled out by #if 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062static PyMethodDef unicode_methods[] = {
13063
13064 /* Order is according to common usage: often used methods should
13065 appear first, since lookup is done sequentially. */
13066
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013067 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013068 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13069 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013070 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013071 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13072 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013073 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013074 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13075 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13076 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13077 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13078 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013079 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013080 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13081 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13082 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013083 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013084 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13085 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13086 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013087 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013089 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013090 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013091 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13092 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13093 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13094 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13095 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13096 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13097 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13098 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13099 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13100 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13101 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13102 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13103 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13104 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013105 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013106 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013107 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013108 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013109 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013110 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013111 {"maketrans", (PyCFunction) unicode_maketrans,
13112 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013113 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013114#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013115 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013116 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117#endif
13118
Benjamin Peterson14339b62009-01-31 16:36:08 +000013119 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120 {NULL, NULL}
13121};
13122
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013123static PyObject *
13124unicode_mod(PyObject *v, PyObject *w)
13125{
Brian Curtindfc80e32011-08-10 20:28:54 -050013126 if (!PyUnicode_Check(v))
13127 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013129}
13130
/* Number-protocol table for str.  Only the remainder slot is filled in
   (with unicode_mod); the slots listed before it are explicitly 0 and
   every slot after nb_remainder is left to C's zero-initialization of
   the remaining members. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13137
/* Sequence-protocol table for str.  The entries are positional, so their
   order must follow the slot order named in the trailing comments.
   The slice and assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13148
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013149static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013150unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 if (PyUnicode_READY(self) == -1)
13153 return NULL;
13154
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013155 if (PyIndex_Check(item)) {
13156 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013157 if (i == -1 && PyErr_Occurred())
13158 return NULL;
13159 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013161 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013162 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013163 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013164 PyObject *result;
13165 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013166 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013167 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013171 return NULL;
13172 }
13173
13174 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013175 Py_INCREF(unicode_empty);
13176 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013178 slicelength == PyUnicode_GET_LENGTH(self)) {
13179 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013180 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013182 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013183 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013184 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013185 src_kind = PyUnicode_KIND(self);
13186 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013187 if (!PyUnicode_IS_ASCII(self)) {
13188 kind_limit = kind_maxchar_limit(src_kind);
13189 max_char = 0;
13190 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13191 ch = PyUnicode_READ(src_kind, src_data, cur);
13192 if (ch > max_char) {
13193 max_char = ch;
13194 if (max_char >= kind_limit)
13195 break;
13196 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013197 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013198 }
Victor Stinner55c99112011-10-13 01:17:06 +020013199 else
13200 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013201 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013202 if (result == NULL)
13203 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013204 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013205 dest_data = PyUnicode_DATA(result);
13206
13207 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013208 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13209 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013210 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013211 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013212 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013213 } else {
13214 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13215 return NULL;
13216 }
13217}
13218
/* Mapping-protocol table for str: length and subscription are provided,
   item assignment is not (mp_ass_subscript is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13224
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226/* Helpers for PyUnicode_Format() */
13227
13228static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013229getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013231 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 (*p_argidx)++;
13234 if (arglen < 0)
13235 return args;
13236 else
13237 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238 }
13239 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 return NULL;
13242}
13243
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013244/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013246static PyObject *
13247formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013249 char *p;
13250 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013252
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 x = PyFloat_AsDouble(v);
13254 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013255 return NULL;
13256
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013258 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013259
Eric Smith0923d1d2009-04-16 20:16:10 +000013260 p = PyOS_double_to_string(x, type, prec,
13261 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013262 if (p == NULL)
13263 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013265 PyMem_Free(p);
13266 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267}
13268
Tim Peters38fd5b62000-09-21 05:43:11 +000013269static PyObject*
13270formatlong(PyObject *val, int flags, int prec, int type)
13271{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013272 char *buf;
13273 int len;
13274 PyObject *str; /* temporary string object. */
13275 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013276
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13278 if (!str)
13279 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 Py_DECREF(str);
13282 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013283}
13284
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013285static Py_UCS4
13286formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013288 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013289 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013291 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 goto onError;
13294 }
13295 else {
13296 /* Integer input truncated to a character */
13297 long x;
13298 x = PyLong_AsLong(v);
13299 if (x == -1 && PyErr_Occurred())
13300 goto onError;
13301
Victor Stinner8faf8212011-12-08 22:14:11 +010013302 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 PyErr_SetString(PyExc_OverflowError,
13304 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013305 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 }
13307
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013308 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013310
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013312 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013314 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315}
13316
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013317static int
13318repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13319{
13320 int r;
13321 assert(count > 0);
13322 assert(PyUnicode_Check(obj));
13323 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013324 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013325 if (repeated == NULL)
13326 return -1;
13327 r = _PyAccu_Accumulate(acc, repeated);
13328 Py_DECREF(repeated);
13329 return r;
13330 }
13331 else {
13332 do {
13333 if (_PyAccu_Accumulate(acc, obj))
13334 return -1;
13335 } while (--count);
13336 return 0;
13337 }
13338}
13339
Alexander Belopolsky40018472011-02-26 01:02:56 +000013340PyObject *
13341PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013343 void *fmt;
13344 int fmtkind;
13345 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013347 int r;
13348 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013351 PyObject *temp = NULL;
13352 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013353 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013354 _PyAccu acc;
13355 static PyObject *plus, *minus, *blank, *zero, *percent;
13356
13357 if (!plus && !(plus = get_latin1_char('+')))
13358 return NULL;
13359 if (!minus && !(minus = get_latin1_char('-')))
13360 return NULL;
13361 if (!blank && !(blank = get_latin1_char(' ')))
13362 return NULL;
13363 if (!zero && !(zero = get_latin1_char('0')))
13364 return NULL;
13365 if (!percent && !(percent = get_latin1_char('%')))
13366 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013367
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013369 PyErr_BadInternalCall();
13370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013372 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013373 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013375 if (PyUnicode_READY(uformat) == -1)
13376 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013377 if (_PyAccu_Init(&acc))
13378 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013379 fmt = PyUnicode_DATA(uformat);
13380 fmtkind = PyUnicode_KIND(uformat);
13381 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13382 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 arglen = PyTuple_Size(args);
13386 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387 }
13388 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 arglen = -1;
13390 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013392 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013393 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395
13396 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013397 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013398 PyObject *nonfmt;
13399 Py_ssize_t nonfmtpos;
13400 nonfmtpos = fmtpos++;
13401 while (fmtcnt >= 0 &&
13402 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13403 fmtpos++;
13404 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013405 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013406 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013407 if (nonfmt == NULL)
13408 goto onError;
13409 r = _PyAccu_Accumulate(&acc, nonfmt);
13410 Py_DECREF(nonfmt);
13411 if (r)
13412 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013413 }
13414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 /* Got a format specifier */
13416 int flags = 0;
13417 Py_ssize_t width = -1;
13418 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 int isnumok;
13422 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013423 void *pbuf = NULL;
13424 Py_ssize_t pindex, len;
13425 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 fmtpos++;
13428 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13429 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 Py_ssize_t keylen;
13431 PyObject *key;
13432 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013433
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 if (dict == NULL) {
13435 PyErr_SetString(PyExc_TypeError,
13436 "format requires a mapping");
13437 goto onError;
13438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013439 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 /* Skip over balanced parentheses */
13443 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013444 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 if (fmtcnt < 0 || pcount > 0) {
13452 PyErr_SetString(PyExc_ValueError,
13453 "incomplete format key");
13454 goto onError;
13455 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013456 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013457 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 if (key == NULL)
13459 goto onError;
13460 if (args_owned) {
13461 Py_DECREF(args);
13462 args_owned = 0;
13463 }
13464 args = PyObject_GetItem(dict, key);
13465 Py_DECREF(key);
13466 if (args == NULL) {
13467 goto onError;
13468 }
13469 args_owned = 1;
13470 arglen = -1;
13471 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 case '-': flags |= F_LJUST; continue;
13476 case '+': flags |= F_SIGN; continue;
13477 case ' ': flags |= F_BLANK; continue;
13478 case '#': flags |= F_ALT; continue;
13479 case '0': flags |= F_ZERO; continue;
13480 }
13481 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013482 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 if (c == '*') {
13484 v = getnextarg(args, arglen, &argidx);
13485 if (v == NULL)
13486 goto onError;
13487 if (!PyLong_Check(v)) {
13488 PyErr_SetString(PyExc_TypeError,
13489 "* wants int");
13490 goto onError;
13491 }
13492 width = PyLong_AsLong(v);
13493 if (width == -1 && PyErr_Occurred())
13494 goto onError;
13495 if (width < 0) {
13496 flags |= F_LJUST;
13497 width = -width;
13498 }
13499 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 }
13502 else if (c >= '0' && c <= '9') {
13503 width = c - '0';
13504 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 if (c < '0' || c > '9')
13507 break;
13508 if ((width*10) / 10 != width) {
13509 PyErr_SetString(PyExc_ValueError,
13510 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013511 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 }
13513 width = width*10 + (c - '0');
13514 }
13515 }
13516 if (c == '.') {
13517 prec = 0;
13518 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 if (c == '*') {
13521 v = getnextarg(args, arglen, &argidx);
13522 if (v == NULL)
13523 goto onError;
13524 if (!PyLong_Check(v)) {
13525 PyErr_SetString(PyExc_TypeError,
13526 "* wants int");
13527 goto onError;
13528 }
13529 prec = PyLong_AsLong(v);
13530 if (prec == -1 && PyErr_Occurred())
13531 goto onError;
13532 if (prec < 0)
13533 prec = 0;
13534 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 }
13537 else if (c >= '0' && c <= '9') {
13538 prec = c - '0';
13539 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013540 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 if (c < '0' || c > '9')
13542 break;
13543 if ((prec*10) / 10 != prec) {
13544 PyErr_SetString(PyExc_ValueError,
13545 "prec too big");
13546 goto onError;
13547 }
13548 prec = prec*10 + (c - '0');
13549 }
13550 }
13551 } /* prec */
13552 if (fmtcnt >= 0) {
13553 if (c == 'h' || c == 'l' || c == 'L') {
13554 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 }
13557 }
13558 if (fmtcnt < 0) {
13559 PyErr_SetString(PyExc_ValueError,
13560 "incomplete format");
13561 goto onError;
13562 }
13563 if (c != '%') {
13564 v = getnextarg(args, arglen, &argidx);
13565 if (v == NULL)
13566 goto onError;
13567 }
13568 sign = 0;
13569 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 switch (c) {
13572
13573 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013574 _PyAccu_Accumulate(&acc, percent);
13575 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013576
13577 case 's':
13578 case 'r':
13579 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013580 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 temp = v;
13582 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013583 }
13584 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 if (c == 's')
13586 temp = PyObject_Str(v);
13587 else if (c == 'r')
13588 temp = PyObject_Repr(v);
13589 else
13590 temp = PyObject_ASCII(v);
13591 if (temp == NULL)
13592 goto onError;
13593 if (PyUnicode_Check(temp))
13594 /* nothing to do */;
13595 else {
13596 Py_DECREF(temp);
13597 PyErr_SetString(PyExc_TypeError,
13598 "%s argument has non-string str()");
13599 goto onError;
13600 }
13601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 if (PyUnicode_READY(temp) == -1) {
13603 Py_CLEAR(temp);
13604 goto onError;
13605 }
13606 pbuf = PyUnicode_DATA(temp);
13607 kind = PyUnicode_KIND(temp);
13608 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if (prec >= 0 && len > prec)
13610 len = prec;
13611 break;
13612
13613 case 'i':
13614 case 'd':
13615 case 'u':
13616 case 'o':
13617 case 'x':
13618 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 isnumok = 0;
13620 if (PyNumber_Check(v)) {
13621 PyObject *iobj=NULL;
13622
13623 if (PyLong_Check(v)) {
13624 iobj = v;
13625 Py_INCREF(iobj);
13626 }
13627 else {
13628 iobj = PyNumber_Long(v);
13629 }
13630 if (iobj!=NULL) {
13631 if (PyLong_Check(iobj)) {
13632 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013633 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 Py_DECREF(iobj);
13635 if (!temp)
13636 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 if (PyUnicode_READY(temp) == -1) {
13638 Py_CLEAR(temp);
13639 goto onError;
13640 }
13641 pbuf = PyUnicode_DATA(temp);
13642 kind = PyUnicode_KIND(temp);
13643 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 sign = 1;
13645 }
13646 else {
13647 Py_DECREF(iobj);
13648 }
13649 }
13650 }
13651 if (!isnumok) {
13652 PyErr_Format(PyExc_TypeError,
13653 "%%%c format: a number is required, "
13654 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13655 goto onError;
13656 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013657 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013658 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013659 fillobj = zero;
13660 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013661 break;
13662
13663 case 'e':
13664 case 'E':
13665 case 'f':
13666 case 'F':
13667 case 'g':
13668 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013669 temp = formatfloat(v, flags, prec, c);
13670 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 if (PyUnicode_READY(temp) == -1) {
13673 Py_CLEAR(temp);
13674 goto onError;
13675 }
13676 pbuf = PyUnicode_DATA(temp);
13677 kind = PyUnicode_KIND(temp);
13678 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013680 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013682 fillobj = zero;
13683 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 break;
13685
13686 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013687 {
13688 Py_UCS4 ch = formatchar(v);
13689 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013691 temp = _PyUnicode_FromUCS4(&ch, 1);
13692 if (temp == NULL)
13693 goto onError;
13694 pbuf = PyUnicode_DATA(temp);
13695 kind = PyUnicode_KIND(temp);
13696 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013698 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013699
13700 default:
13701 PyErr_Format(PyExc_ValueError,
13702 "unsupported format character '%c' (0x%x) "
13703 "at index %zd",
13704 (31<=c && c<=126) ? (char)c : '?',
13705 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013707 goto onError;
13708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 /* pbuf is initialized here. */
13710 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013712 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13713 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013715 pindex++;
13716 }
13717 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13718 signobj = plus;
13719 len--;
13720 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 }
13722 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013723 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013725 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 else
13727 sign = 0;
13728 }
13729 if (width < len)
13730 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013732 if (fill != ' ') {
13733 assert(signobj != NULL);
13734 if (_PyAccu_Accumulate(&acc, signobj))
13735 goto onError;
13736 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 if (width > len)
13738 width--;
13739 }
13740 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013741 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013742 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013744 second = get_latin1_char(
13745 PyUnicode_READ(kind, pbuf, pindex + 1));
13746 pindex += 2;
13747 if (second == NULL ||
13748 _PyAccu_Accumulate(&acc, zero) ||
13749 _PyAccu_Accumulate(&acc, second))
13750 goto onError;
13751 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 width -= 2;
13754 if (width < 0)
13755 width = 0;
13756 len -= 2;
13757 }
13758 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013759 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013760 if (repeat_accumulate(&acc, fillobj, width - len))
13761 goto onError;
13762 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013763 }
13764 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013765 if (sign) {
13766 assert(signobj != NULL);
13767 if (_PyAccu_Accumulate(&acc, signobj))
13768 goto onError;
13769 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013771 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13772 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013773 second = get_latin1_char(
13774 PyUnicode_READ(kind, pbuf, pindex + 1));
13775 pindex += 2;
13776 if (second == NULL ||
13777 _PyAccu_Accumulate(&acc, zero) ||
13778 _PyAccu_Accumulate(&acc, second))
13779 goto onError;
13780 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013781 }
13782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013784 if (temp != NULL) {
13785 assert(pbuf == PyUnicode_DATA(temp));
13786 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013788 else {
13789 const char *p = (const char *) pbuf;
13790 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013791 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013792 v = PyUnicode_FromKindAndData(kind, p, len);
13793 }
13794 if (v == NULL)
13795 goto onError;
13796 r = _PyAccu_Accumulate(&acc, v);
13797 Py_DECREF(v);
13798 if (r)
13799 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013800 if (width > len && repeat_accumulate(&acc, blank, width - len))
13801 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013802 if (dict && (argidx < arglen) && c != '%') {
13803 PyErr_SetString(PyExc_TypeError,
13804 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 goto onError;
13806 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013807 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809 } /* until end */
13810 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 PyErr_SetString(PyExc_TypeError,
13812 "not all arguments converted during string formatting");
13813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814 }
13815
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013816 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 }
13820 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013821 Py_XDECREF(temp);
13822 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013823 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013827 Py_XDECREF(temp);
13828 Py_XDECREF(second);
13829 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 }
13833 return NULL;
13834}
13835
Jeremy Hylton938ace62002-07-17 16:30:39 +000013836static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013837unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13838
Tim Peters6d6c1a32001-08-02 04:15:00 +000013839static PyObject *
13840unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13841{
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013843 static char *kwlist[] = {"object", "encoding", "errors", 0};
13844 char *encoding = NULL;
13845 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013846
Benjamin Peterson14339b62009-01-31 16:36:08 +000013847 if (type != &PyUnicode_Type)
13848 return unicode_subtype_new(type, args, kwds);
13849 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013851 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013852 if (x == NULL) {
13853 Py_INCREF(unicode_empty);
13854 return unicode_empty;
13855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 if (encoding == NULL && errors == NULL)
13857 return PyObject_Str(x);
13858 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013860}
13861
Guido van Rossume023fe02001-08-30 03:12:59 +000013862static PyObject *
13863unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13864{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013865 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013866 Py_ssize_t length, char_size;
13867 int share_wstr, share_utf8;
13868 unsigned int kind;
13869 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013870
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013872
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013873 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013874 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013875 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013876 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013877 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013878 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013879 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013880 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013881
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013882 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013883 if (self == NULL) {
13884 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013885 return NULL;
13886 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013887 kind = PyUnicode_KIND(unicode);
13888 length = PyUnicode_GET_LENGTH(unicode);
13889
13890 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013891#ifdef Py_DEBUG
13892 _PyUnicode_HASH(self) = -1;
13893#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013894 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013895#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013896 _PyUnicode_STATE(self).interned = 0;
13897 _PyUnicode_STATE(self).kind = kind;
13898 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013899 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013900 _PyUnicode_STATE(self).ready = 1;
13901 _PyUnicode_WSTR(self) = NULL;
13902 _PyUnicode_UTF8_LENGTH(self) = 0;
13903 _PyUnicode_UTF8(self) = NULL;
13904 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013905 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013906
13907 share_utf8 = 0;
13908 share_wstr = 0;
13909 if (kind == PyUnicode_1BYTE_KIND) {
13910 char_size = 1;
13911 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13912 share_utf8 = 1;
13913 }
13914 else if (kind == PyUnicode_2BYTE_KIND) {
13915 char_size = 2;
13916 if (sizeof(wchar_t) == 2)
13917 share_wstr = 1;
13918 }
13919 else {
13920 assert(kind == PyUnicode_4BYTE_KIND);
13921 char_size = 4;
13922 if (sizeof(wchar_t) == 4)
13923 share_wstr = 1;
13924 }
13925
13926 /* Ensure we won't overflow the length. */
13927 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13928 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013930 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013931 data = PyObject_MALLOC((length + 1) * char_size);
13932 if (data == NULL) {
13933 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 goto onError;
13935 }
13936
Victor Stinnerc3c74152011-10-02 20:39:55 +020013937 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013938 if (share_utf8) {
13939 _PyUnicode_UTF8_LENGTH(self) = length;
13940 _PyUnicode_UTF8(self) = data;
13941 }
13942 if (share_wstr) {
13943 _PyUnicode_WSTR_LENGTH(self) = length;
13944 _PyUnicode_WSTR(self) = (wchar_t *)data;
13945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013946
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013947 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013948 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013949 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013950#ifdef Py_DEBUG
13951 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13952#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013953 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013954 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013955
13956onError:
13957 Py_DECREF(unicode);
13958 Py_DECREF(self);
13959 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013960}
13961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013962PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013963 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013964\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013965Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013966encoding defaults to the current default string encoding.\n\
13967errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013968
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013969static PyObject *unicode_iter(PyObject *seq);
13970
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013972 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 "str", /* tp_name */
13974 sizeof(PyUnicodeObject), /* tp_size */
13975 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 (destructor)unicode_dealloc, /* tp_dealloc */
13978 0, /* tp_print */
13979 0, /* tp_getattr */
13980 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013981 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 unicode_repr, /* tp_repr */
13983 &unicode_as_number, /* tp_as_number */
13984 &unicode_as_sequence, /* tp_as_sequence */
13985 &unicode_as_mapping, /* tp_as_mapping */
13986 (hashfunc) unicode_hash, /* tp_hash*/
13987 0, /* tp_call*/
13988 (reprfunc) unicode_str, /* tp_str */
13989 PyObject_GenericGetAttr, /* tp_getattro */
13990 0, /* tp_setattro */
13991 0, /* tp_as_buffer */
13992 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013993 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 unicode_doc, /* tp_doc */
13995 0, /* tp_traverse */
13996 0, /* tp_clear */
13997 PyUnicode_RichCompare, /* tp_richcompare */
13998 0, /* tp_weaklistoffset */
13999 unicode_iter, /* tp_iter */
14000 0, /* tp_iternext */
14001 unicode_methods, /* tp_methods */
14002 0, /* tp_members */
14003 0, /* tp_getset */
14004 &PyBaseObject_Type, /* tp_base */
14005 0, /* tp_dict */
14006 0, /* tp_descr_get */
14007 0, /* tp_descr_set */
14008 0, /* tp_dictoffset */
14009 0, /* tp_init */
14010 0, /* tp_alloc */
14011 unicode_new, /* tp_new */
14012 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014013};
14014
14015/* Initialize the Unicode implementation */
14016
Victor Stinner3a50e702011-10-18 21:21:00 +020014017int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014018{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014019 int i;
14020
Thomas Wouters477c8d52006-05-27 19:21:47 +000014021 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014022 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014023 0x000A, /* LINE FEED */
14024 0x000D, /* CARRIAGE RETURN */
14025 0x001C, /* FILE SEPARATOR */
14026 0x001D, /* GROUP SEPARATOR */
14027 0x001E, /* RECORD SEPARATOR */
14028 0x0085, /* NEXT LINE */
14029 0x2028, /* LINE SEPARATOR */
14030 0x2029, /* PARAGRAPH SEPARATOR */
14031 };
14032
Fred Drakee4315f52000-05-09 19:53:39 +000014033 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014034 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014035 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014036 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014037 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014038
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014039 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014040 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014041 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014042 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014043
14044 /* initialize the linebreak bloom filter */
14045 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014046 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014047 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014048
14049 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014050
14051#ifdef HAVE_MBCS
14052 winver.dwOSVersionInfoSize = sizeof(winver);
14053 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14054 PyErr_SetFromWindowsErr(0);
14055 return -1;
14056 }
14057#endif
14058 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059}
14060
14061/* Finalize the Unicode implementation */
14062
/* This implementation has nothing to release: it performs no work and
   always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14068
Guido van Rossumd57fd912000-03-10 22:53:23 +000014069void
Thomas Wouters78890102000-07-22 19:25:51 +000014070_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014072 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014074 Py_XDECREF(unicode_empty);
14075 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014076
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014077 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014078 if (unicode_latin1[i]) {
14079 Py_DECREF(unicode_latin1[i]);
14080 unicode_latin1[i] = NULL;
14081 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014082 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014083 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014084 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014085}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014086
Walter Dörwald16807132007-05-25 13:52:07 +000014087void
14088PyUnicode_InternInPlace(PyObject **p)
14089{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014090 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014092#ifdef Py_DEBUG
14093 assert(s != NULL);
14094 assert(_PyUnicode_CHECK(s));
14095#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014096 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014097 return;
14098#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 /* If it's a subclass, we don't really know what putting
14100 it in the interned dict might do. */
14101 if (!PyUnicode_CheckExact(s))
14102 return;
14103 if (PyUnicode_CHECK_INTERNED(s))
14104 return;
14105 if (interned == NULL) {
14106 interned = PyDict_New();
14107 if (interned == NULL) {
14108 PyErr_Clear(); /* Don't leave an exception */
14109 return;
14110 }
14111 }
14112 /* It might be that the GetItem call fails even
14113 though the key is present in the dictionary,
14114 namely when this happens during a stack overflow. */
14115 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014116 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014118
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 if (t) {
14120 Py_INCREF(t);
14121 Py_DECREF(*p);
14122 *p = t;
14123 return;
14124 }
Walter Dörwald16807132007-05-25 13:52:07 +000014125
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014127 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014128 PyErr_Clear();
14129 PyThreadState_GET()->recursion_critical = 0;
14130 return;
14131 }
14132 PyThreadState_GET()->recursion_critical = 0;
14133 /* The two references in interned are not counted by refcnt.
14134 The deallocator will take care of this */
14135 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014136 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014137}
14138
14139void
14140PyUnicode_InternImmortal(PyObject **p)
14141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 PyUnicode_InternInPlace(p);
14143 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014144 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 Py_INCREF(*p);
14146 }
Walter Dörwald16807132007-05-25 13:52:07 +000014147}
14148
14149PyObject *
14150PyUnicode_InternFromString(const char *cp)
14151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 PyObject *s = PyUnicode_FromString(cp);
14153 if (s == NULL)
14154 return NULL;
14155 PyUnicode_InternInPlace(&s);
14156 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014157}
14158
Alexander Belopolsky40018472011-02-26 01:02:56 +000014159void
14160_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014162 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014163 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 Py_ssize_t i, n;
14165 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014166
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 if (interned == NULL || !PyDict_Check(interned))
14168 return;
14169 keys = PyDict_Keys(interned);
14170 if (keys == NULL || !PyList_Check(keys)) {
14171 PyErr_Clear();
14172 return;
14173 }
Walter Dörwald16807132007-05-25 13:52:07 +000014174
Benjamin Peterson14339b62009-01-31 16:36:08 +000014175 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14176 detector, interned unicode strings are not forcibly deallocated;
14177 rather, we give them their stolen references back, and then clear
14178 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014179
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 n = PyList_GET_SIZE(keys);
14181 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014182 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014183 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014184 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014185 if (PyUnicode_READY(s) == -1) {
14186 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014187 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014189 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014190 case SSTATE_NOT_INTERNED:
14191 /* XXX Shouldn't happen */
14192 break;
14193 case SSTATE_INTERNED_IMMORTAL:
14194 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014195 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 break;
14197 case SSTATE_INTERNED_MORTAL:
14198 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014199 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014200 break;
14201 default:
14202 Py_FatalError("Inconsistent interned string state.");
14203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014204 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 }
14206 fprintf(stderr, "total size of all interned strings: "
14207 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14208 "mortal/immortal\n", mortal_size, immortal_size);
14209 Py_DECREF(keys);
14210 PyDict_Clear(interned);
14211 Py_DECREF(interned);
14212 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014213}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014214
14215
14216/********************* Unicode Iterator **************************/
14217
14218typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014219 PyObject_HEAD
14220 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014221 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014222} unicodeiterobject;
14223
14224static void
14225unicodeiter_dealloc(unicodeiterobject *it)
14226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014227 _PyObject_GC_UNTRACK(it);
14228 Py_XDECREF(it->it_seq);
14229 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014230}
14231
14232static int
14233unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014235 Py_VISIT(it->it_seq);
14236 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014237}
14238
14239static PyObject *
14240unicodeiter_next(unicodeiterobject *it)
14241{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014242 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014243
Benjamin Peterson14339b62009-01-31 16:36:08 +000014244 assert(it != NULL);
14245 seq = it->it_seq;
14246 if (seq == NULL)
14247 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014248 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014250 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14251 int kind = PyUnicode_KIND(seq);
14252 void *data = PyUnicode_DATA(seq);
14253 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14254 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 if (item != NULL)
14256 ++it->it_index;
14257 return item;
14258 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014259
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 Py_DECREF(seq);
14261 it->it_seq = NULL;
14262 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014263}
14264
14265static PyObject *
14266unicodeiter_len(unicodeiterobject *it)
14267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 Py_ssize_t len = 0;
14269 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014270 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014272}
14273
14274PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14275
14276static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014278 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014279 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014280};
14281
14282PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14284 "str_iterator", /* tp_name */
14285 sizeof(unicodeiterobject), /* tp_basicsize */
14286 0, /* tp_itemsize */
14287 /* methods */
14288 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14289 0, /* tp_print */
14290 0, /* tp_getattr */
14291 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014292 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 0, /* tp_repr */
14294 0, /* tp_as_number */
14295 0, /* tp_as_sequence */
14296 0, /* tp_as_mapping */
14297 0, /* tp_hash */
14298 0, /* tp_call */
14299 0, /* tp_str */
14300 PyObject_GenericGetAttr, /* tp_getattro */
14301 0, /* tp_setattro */
14302 0, /* tp_as_buffer */
14303 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14304 0, /* tp_doc */
14305 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14306 0, /* tp_clear */
14307 0, /* tp_richcompare */
14308 0, /* tp_weaklistoffset */
14309 PyObject_SelfIter, /* tp_iter */
14310 (iternextfunc)unicodeiter_next, /* tp_iternext */
14311 unicodeiter_methods, /* tp_methods */
14312 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014313};
14314
14315static PyObject *
14316unicode_iter(PyObject *seq)
14317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014318 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014319
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 if (!PyUnicode_Check(seq)) {
14321 PyErr_BadInternalCall();
14322 return NULL;
14323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014324 if (PyUnicode_READY(seq) == -1)
14325 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14327 if (it == NULL)
14328 return NULL;
14329 it->it_index = 0;
14330 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014331 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014332 _PyObject_GC_TRACK(it);
14333 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014334}
14335
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014336
14337size_t
14338Py_UNICODE_strlen(const Py_UNICODE *u)
14339{
14340 int res = 0;
14341 while(*u++)
14342 res++;
14343 return res;
14344}
14345
14346Py_UNICODE*
14347Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14348{
14349 Py_UNICODE *u = s1;
14350 while ((*u++ = *s2++));
14351 return s1;
14352}
14353
14354Py_UNICODE*
14355Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14356{
14357 Py_UNICODE *u = s1;
14358 while ((*u++ = *s2++))
14359 if (n-- == 0)
14360 break;
14361 return s1;
14362}
14363
14364Py_UNICODE*
14365Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14366{
14367 Py_UNICODE *u1 = s1;
14368 u1 += Py_UNICODE_strlen(u1);
14369 Py_UNICODE_strcpy(u1, s2);
14370 return s1;
14371}
14372
14373int
14374Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14375{
14376 while (*s1 && *s2 && *s1 == *s2)
14377 s1++, s2++;
14378 if (*s1 && *s2)
14379 return (*s1 < *s2) ? -1 : +1;
14380 if (*s1)
14381 return 1;
14382 if (*s2)
14383 return -1;
14384 return 0;
14385}
14386
14387int
14388Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14389{
14390 register Py_UNICODE u1, u2;
14391 for (; n != 0; n--) {
14392 u1 = *s1;
14393 u2 = *s2;
14394 if (u1 != u2)
14395 return (u1 < u2) ? -1 : +1;
14396 if (u1 == '\0')
14397 return 0;
14398 s1++;
14399 s2++;
14400 }
14401 return 0;
14402}
14403
14404Py_UNICODE*
14405Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14406{
14407 const Py_UNICODE *p;
14408 for (p = s; *p; p++)
14409 if (*p == c)
14410 return (Py_UNICODE*)p;
14411 return NULL;
14412}
14413
14414Py_UNICODE*
14415Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14416{
14417 const Py_UNICODE *p;
14418 p = s + Py_UNICODE_strlen(s);
14419 while (p != s) {
14420 p--;
14421 if (*p == c)
14422 return (Py_UNICODE*)p;
14423 }
14424 return NULL;
14425}
Victor Stinner331ea922010-08-10 16:37:20 +000014426
Victor Stinner71133ff2010-09-01 23:43:53 +000014427Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014428PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014429{
Victor Stinner577db2c2011-10-11 22:12:48 +020014430 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014431 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014433 if (!PyUnicode_Check(unicode)) {
14434 PyErr_BadArgument();
14435 return NULL;
14436 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014437 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014438 if (u == NULL)
14439 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014440 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014441 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014442 PyErr_NoMemory();
14443 return NULL;
14444 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014445 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014446 size *= sizeof(Py_UNICODE);
14447 copy = PyMem_Malloc(size);
14448 if (copy == NULL) {
14449 PyErr_NoMemory();
14450 return NULL;
14451 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014452 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014453 return copy;
14454}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014455
Georg Brandl66c221e2010-10-14 07:04:07 +000014456/* A _string module, to export formatter_parser and formatter_field_name_split
14457 to the string.Formatter class implemented in Python. */
14458
14459static PyMethodDef _string_methods[] = {
14460 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14461 METH_O, PyDoc_STR("split the argument as a field name")},
14462 {"formatter_parser", (PyCFunction) formatter_parser,
14463 METH_O, PyDoc_STR("parse the argument as a format string")},
14464 {NULL, NULL}
14465};
14466
14467static struct PyModuleDef _string_module = {
14468 PyModuleDef_HEAD_INIT,
14469 "_string",
14470 PyDoc_STR("string helper module"),
14471 0,
14472 _string_methods,
14473 NULL,
14474 NULL,
14475 NULL,
14476 NULL
14477};
14478
14479PyMODINIT_FUNC
14480PyInit__string(void)
14481{
14482 return PyModule_Create(&_string_module);
14483}
14484
14485
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014486#ifdef __cplusplus
14487}
14488#endif